# Data Preprocessing and Classification

This notebook starts from the image files saved from an iPhone 15 Pro, which is capable of taking 'portrait' photos that include depth map information.

We generate a review file containing a list of all the images.  

We generate a mapping file.  The file list is used to generate a mapping file to rename and effectively label the images.  Note that the images were taken in a specific order to aid labelling.



The renamed files are then sorted into relevant folders dependent on their name content.  Some explanation:

gs_      greyscale, 1-dimensional/channel black and white photos
or_      original, 3-dimensional/channel colour (red green blue, RGB) photos
fc_      four-channel; 4-dimensional/channel consisting of RGB with an additional depth map layer



CLASSIFICATIONS

There are 4 classification models in the data

country
piece
exact_piece
force


LOCALISATION

This will mainly relate to the locations of the folders and files 

# Part 1 : Summary List of image files

The following creates an excel which summarises the content of the image file folder.  This file is used to analyse the image files.

OFFLINE a mapping file of current_name/target_name is generated to copy/rename files in part 3.

LOCALISATION

directory_path   this is the target location where all the images from the phone are located

In [None]:
import os
import pandas as pd
from datetime import datetime

def create_file_list(directory):
    """Write an Excel inventory of every file found under ``directory``.

    The workbook is saved as ``FileList/file_list_<YYYYmmdd_HHMMSS>.xlsx``
    inside ``directory`` and holds one row per file with the columns
    ``Filename``, ``Folder`` (relative to ``directory``),
    ``Modification Date`` and ``Type`` (the extension, or
    ``'No extension'``).  The ``FileList`` subfolder is created before the
    scan, so files already stored in it are listed as well.  The saved path
    is printed; nothing is returned.
    """
    # The report lives in a 'FileList' subfolder of the scanned directory.
    report_dir = os.path.join(directory, 'FileList')
    if not os.path.exists(report_dir):
        os.makedirs(report_dir)

    rows = []
    for folder, _subfolders, names in os.walk(directory):
        for name in names:
            # Last-modified time, rendered in a human-readable form.
            modified = datetime.fromtimestamp(
                os.path.getmtime(os.path.join(folder, name))
            )
            suffix = os.path.splitext(name)[1]
            rows.append({
                'Filename': name,
                'Folder': os.path.relpath(folder, directory),
                'Modification Date': modified.strftime('%Y-%m-%d %H:%M:%S'),
                'Type': suffix or 'No extension',
            })

    # Timestamp the workbook name so repeated runs never overwrite each other.
    stamp = datetime.now().strftime('%Y%m%d_%H%M%S')
    excel_path = os.path.join(report_dir, f'file_list_{stamp}.xlsx')
    pd.DataFrame(rows).to_excel(excel_path, index=False)
    print(f'Excel file has been saved to {excel_path}')

# LOCALISATION: folder holding everything imported from the phone
directory_path = r'C:\Users\ReCas\OneDrive\Documents\2024_AIMachineLearning\99_Projects\00.Imported_All_Files\202407__'
create_file_list(directory=directory_path)

# run to generate :
Excel file has been saved to C:\Users\ReCas\OneDrive\Documents\2024_AIMachineLearning\99_Projects\00.Imported_All_Files\202407__\FileList\file_list_20240729_125635.xlsx

# Part 2: collect only the jpg files

There are 4 formats of image file per photo, jpg (original), jpg (enhanced), 2 x aae files.  We need the original jpg, copied to new folder for repeatability of process.

LOCALISATION

source_directory            this is the target location where all the images from the phone are located
destination_directory       this is the destination location folder for the jpg only files

In [None]:
import os
import shutil
import re

def copy_filtered_images(source_dir, dest_dir):
    """Copy the ``IMG_####.JPG`` files found directly in ``source_dir``.

    Only regular files in the top level of ``source_dir`` are considered;
    subdirectories are not searched.  A name qualifies when it is ``IMG_``
    followed by exactly four digits and ``.JPG`` (case-insensitive).
    ``dest_dir`` is created when it does not exist.

    Returns:
        int: The number of files copied.
    """
    # IMG_ + four digits + .JPG, in any letter case.
    wanted = re.compile(r'^IMG_\d{4}\.JPG$', re.IGNORECASE)

    if not os.path.exists(dest_dir):
        os.makedirs(dest_dir)

    copied = 0
    for entry in os.listdir(source_dir):
        candidate = os.path.join(source_dir, entry)
        # Skip folders and anything that is not an original photo.
        if not os.path.isfile(candidate):
            continue
        if wanted.match(entry) is None:
            continue
        shutil.copy(candidate, os.path.join(dest_dir, entry))
        copied += 1

    return copied

# LOCALISATION: where the phone import lives and where the jpg-only copies go
source_directory = r'C:\Users\ReCas\OneDrive\Documents\2024_AIMachineLearning\99_Projects\00.Imported_All_Files\202407__'
destination_directory = r'C:\Users\ReCas\OneDrive\Documents\2024_AIMachineLearning\99_Projects\01.Imported_jpg_Files'

# Copy the original jpg files and report the total
copied_count = copy_filtered_images(
    source_dir=source_directory,
    dest_dir=destination_directory,
)
print(f'Total files copied: {copied_count}')

Total files copied: 1530

# Part 3:  ReName and ReLabel

This moves and renames/labels the original jpg files (they are removed from the source folder).
This uses a mapping file generated from the list in part 1.

LOCALISATION

source_directory         this is where the unlabelled jpg files are

destination_directory    this is where the labelled jpg files will go

excel_file_path          this is the exact file used for file mapping


In [None]:
import os
import pandas as pd

def rename_and_move_files(source_dir, dest_dir, excel_path):
    """Move files from ``source_dir`` into ``dest_dir`` under their mapped names.

    The mapping is read from the Excel workbook at ``excel_path``, which must
    contain the columns ``Current Filename`` and ``New Filename``.  A row is
    skipped when either name is blank or when the current file does not
    exist in ``source_dir``.  ``dest_dir`` is created when missing.

    Args:
        source_dir: Folder holding the files to rename.
        dest_dir: Folder that receives the renamed files.
        excel_path: Path of the mapping workbook.

    Returns:
        int: The number of files moved.
    """
    df = pd.read_excel(excel_path)

    os.makedirs(dest_dir, exist_ok=True)

    renamed_files_count = 0
    for current_filename, new_filename in zip(df['Current Filename'], df['New Filename']):
        # Blank cells arrive as NaN (a float); passing one to os.path.join
        # raises TypeError, so rows missing either name are skipped up front.
        if pd.isna(current_filename) or pd.isna(new_filename):
            continue

        current_file_path = os.path.join(source_dir, current_filename)
        if not os.path.exists(current_file_path):
            continue

        # Move and rename in a single step.
        os.rename(current_file_path, os.path.join(dest_dir, new_filename))
        renamed_files_count += 1

    return renamed_files_count

# LOCALISATION: unlabelled jpg files (input), labelled jpg files (output)
# and the workbook that maps current names to new names
source_directory = r'C:\Users\ReCas\OneDrive\Documents\2024_AIMachineLearning\99_Projects\01.Imported_jpg_Files'
destination_directory = r'C:\Users\ReCas\OneDrive\Documents\2024_AIMachineLearning\99_Projects\02.Labelled_jpg'
excel_file_path = r'C:\Users\ReCas\OneDrive\Documents\2024_AIMachineLearning\99_Projects\00.Imported_All_Files\202407__\FileList\file_mapping.xlsx'

# Rename/move according to the mapping and report the total
renamed_count = rename_and_move_files(
    source_dir=source_directory,
    dest_dir=destination_directory,
    excel_path=excel_file_path,
)
print(f'Total files renamed and moved: {renamed_count}')

Total files renamed and moved: 1530

# Part 4 : Extract and unify layers

Extract all the layers and unify as 4284w x 5712h (blow up depth map)

Extraction uses exiftool

All jpg files converted to png as jpg cannot handle >3 channels

Resizing blows up the depth map layer to the size of the other layers (4284 wide x 5712 high) to allow it to be treated the same as other images.

More than just the later processed images are produced:
    image                   original
    image2                  scaled down original (assumed used for thumbnails)
    image3                  single depth map
    blue_channel            single blue channel
    green_channel           single green channel
    red_channel             single red channel
    grayscale_image         original greyscale image
    upscaled_image3         blown-up / upscaled_image3 (depth map)
    four channel image      combination of original and upscaled_image3

WARNING

This code, when the images are displayed in the log as it runs, takes 4+ hours to run

LOCALISATION

source_directory             where the labelled jpg are
destination_directory        where the extracted files will go


# WARNING take 4 + hours to run if you display the images as you go.

In [None]:
import cv2
import numpy as np
from PIL import Image
import piexif
import matplotlib.pyplot as plt
import os
import time
import subprocess

# LOCALISATION: labelled jpg files (input) and extracted layers (output)
source_directory = r'C:\Users\ReCas\OneDrive\Documents\2024_AIMachineLearning\99_Projects\02.Labelled_jpg'
destination_directory = r'C:\Users\ReCas\OneDrive\Documents\2024_AIMachineLearning\99_Projects\03.ExtractedLayers'
os.makedirs(destination_directory, exist_ok=True)

# Size the depth map is resized to.  The tuple is handed to cv2.resize as its
# size argument, so it is ordered (width, height).
TARGET_WIDTH, TARGET_HEIGHT = 4284, 5712
TARGET_SIZE = (TARGET_WIDTH, TARGET_HEIGHT)

def run_exiftool(image_path, output_dir, timestamp, base_name):
    """Extract the embedded MPImage2 and MPImage3 payloads with exiftool.

    Each payload is written unchanged to
    ``<output_dir>/<timestamp>_<base_name>_mp_image<N>.png``.  The ``.png``
    extension is kept for compatibility with existing outputs; the bytes are
    whatever exiftool returns (NOTE(review): presumably JPEG data - confirm).

    Args:
        image_path: Path of the source image.
        output_dir: Folder that receives the extracted files.
        timestamp: Text placed at the start of each output filename.
        base_name: Source filename without its extension.

    Returns:
        tuple: ``(mp_image2_path, mp_image3_path)`` on success, or
        ``(None, None)`` when exiftool cannot be started, exits with an
        error, or returns no data for one of the tags.
    """
    print("Extracting MP images using exiftool...")
    saved_paths = []

    try:
        for index in (2, 3):
            tag = f"MPImage{index}"
            target_path = os.path.join(output_dir, f"{timestamp}_{base_name}_mp_image{index}.png")
            result = subprocess.run(
                ["exiftool", "-b", f"-{tag}", image_path],
                stdout=subprocess.PIPE,
                stderr=subprocess.PIPE,
                check=True,
            )
            # Writing a file always creates it, so checking for its existence
            # afterwards cannot detect a missing payload; test the data itself.
            if not result.stdout:
                raise FileNotFoundError(f"Extracted {tag} is empty.")
            with open(target_path, 'wb') as f:
                f.write(result.stdout)
            print(f"{tag} saved to {target_path}")
            saved_paths.append(target_path)
    except subprocess.CalledProcessError as e:
        print(f"Error during exiftool extraction: {e.stderr.decode(errors='replace')}")
        return None, None
    except FileNotFoundError as e:
        # Also raised by subprocess.run when the exiftool executable is missing.
        print(f"Error: {str(e)}")
        return None, None

    mp_image2_path, mp_image3_path = saved_paths
    return mp_image2_path, mp_image3_path

def load_image(image_path):
    """Open ``image_path`` with Pillow, print the elapsed time and return the image."""
    print("Loading image...")
    started = time.time()
    loaded = Image.open(image_path)
    elapsed = time.time() - started
    print(f"Loading image took {elapsed:.2f} seconds")
    return loaded

def check_orientation(image):
    """Rotate a Pillow image according to its EXIF orientation tag.

    Orientation 3 rotates by 180 degrees, 6 by 270 and 8 by 90 (each with
    ``expand=True``); any other value leaves the image as is.  A missing
    EXIF block or orientation tag (``KeyError``/``AttributeError``) is
    ignored.  The elapsed time is printed.

    Returns:
        The rotated image, or the input image when no rotation applies.
    """
    print("Checking and correcting image orientation...")
    started = time.time()
    try:
        exif = piexif.load(image.info['exif'])
        tag_value = exif['0th'][piexif.ImageIFD.Orientation]
        # (orientation code, rotation angle passed to Image.rotate)
        for code, angle in ((3, 180), (6, 270), (8, 90)):
            if tag_value == code:
                image = image.rotate(angle, expand=True)
                break
    except (KeyError, AttributeError):
        pass
    elapsed = time.time() - started
    print(f"Orientation check took {elapsed:.2f} seconds")
    return image

def correct_orientation_opencv(image, orientation):
    """Rotate an OpenCV image for EXIF ``orientation`` 3, 6 or 8.

    Orientation 3 turns the image by 180 degrees, 6 by 90 degrees clockwise
    and 8 by 90 degrees counter-clockwise.  Any other value returns
    ``image`` untouched.
    """
    if orientation == 3:
        return cv2.rotate(image, cv2.ROTATE_180)
    if orientation == 6:
        return cv2.rotate(image, cv2.ROTATE_90_CLOCKWISE)
    if orientation == 8:
        return cv2.rotate(image, cv2.ROTATE_90_COUNTERCLOCKWISE)
    return image

def convert_to_opencv(image):
    """Turn ``image`` into a numpy array with RGB reordered to BGR; prints the time taken."""
    print("Converting image to OpenCV format...")
    started = time.time()
    pixels = np.array(image)
    image_cv = cv2.cvtColor(pixels, cv2.COLOR_RGB2BGR)
    elapsed = time.time() - started
    print(f"Conversion took {elapsed:.2f} seconds")
    return image_cv

def extract_rgb_channels(image_cv):
    """Split a 3-channel image into its planes and print the time taken.

    Returns:
        tuple: ``(blue_channel, green_channel, red_channel)``, i.e. the
        planes in the order ``cv2.split`` yields them.
    """
    print("Extracting RGB channels...")
    started = time.time()
    planes = cv2.split(image_cv)
    blue, green, red = planes
    elapsed = time.time() - started
    print(f"RGB extraction took {elapsed:.2f} seconds")
    return blue, green, red

def upscale_image(image, target_size):
    """Resize ``image`` to ``target_size`` with bicubic interpolation.

    ``target_size`` is forwarded to ``cv2.resize`` as its size argument.
    The elapsed time is printed.
    """
    print("Upscaling image...")
    started = time.time()
    resized = cv2.resize(image, target_size, interpolation=cv2.INTER_CUBIC)
    elapsed = time.time() - started
    print(f"Upscaling took {elapsed:.2f} seconds")
    return resized

def normalize_image(image):
    """Min-max normalise ``image`` to the range 0-255 using ``cv2.normalize``."""
    lower, upper = 0, 255
    return cv2.normalize(image, None, lower, upper, cv2.NORM_MINMAX)

def report_image_sizes(images):
    """Print every image whose height/width differs from the first image.

    Only a report is produced; mismatches never raise.

    Returns:
        tuple: ``(height, width)`` of the first image in ``images``.
    """
    base_height, base_width = images[0].shape[:2]
    expected = (base_height, base_width)
    for position, candidate in enumerate(images):
        height, width = candidate.shape[:2]
        if (height, width) != expected:
            print(f"Image {position} has shape {candidate.shape[:2]}, expected ({base_height}, {base_width})")
    return base_height, base_width

def save_images(image_cv, image2, image3, blue_channel, green_channel, red_channel, grayscale_image, upscaled_image3, original_filename, save_directory, timestamp):
    """Write the eight extracted layers to ``save_directory`` as PNG files.

    Every file is named ``<timestamp>_<base>_<layer>.png`` where ``<base>``
    is ``original_filename`` without its extension.  The elapsed time is
    printed.
    """
    start_time = time.time()
    print("Saving images...")
    stem = os.path.splitext(original_filename)[0]

    # (filename label, pixel data) in the order the files are written
    layers = (
        ('original', image_cv),
        ('image2', image2),
        ('image3', image3),
        ('blue_channel', blue_channel),
        ('green_channel', green_channel),
        ('red_channel', red_channel),
        ('grayscale', grayscale_image),
        ('upscaled_image3', upscaled_image3),
    )
    for label, pixels in layers:
        target = os.path.join(save_directory, f'{timestamp}_{stem}_{label}.png')
        cv2.imwrite(target, pixels)

    elapsed = time.time() - start_time
    print(f"Saving images took {elapsed:.2f} seconds")

def display_images(image_cv, image2, image3, blue_channel, green_channel, red_channel, grayscale_image, upscaled_image3):
    """Show the eight layers in a 2 x 4 matplotlib grid and print the time taken.

    The colour image is converted from BGR to RGB for display; every other
    layer is drawn with the ``gray`` colour map.
    """
    start_time = time.time()
    print("Displaying images...")
    fig, axes = plt.subplots(2, 4, figsize=(20, 10))

    gray = {'cmap': 'gray'}
    # (pixel data, title, imshow options), listed row by row
    panels = [
        (cv2.cvtColor(image_cv, cv2.COLOR_BGR2RGB), 'Original Image', {}),
        (image2, 'Image 2', gray),
        (image3, 'Image 3 (Depth Map)', gray),
        (grayscale_image, 'Grayscale Image', gray),
        (blue_channel, 'Blue Channel', gray),
        (green_channel, 'Green Channel', gray),
        (red_channel, 'Red Channel', gray),
        (upscaled_image3, 'Upscaled Image 3', gray),
    ]
    for axis, (pixels, title, options) in zip(axes.flat, panels):
        axis.imshow(pixels, **options)
        axis.set_title(title)
        axis.axis('off')

    plt.tight_layout()
    plt.show()
    elapsed = time.time() - start_time
    print(f"Displaying images took {elapsed:.2f} seconds")

def merge_channels_to_4_channel(image_cv, depth_map):
    """Append ``depth_map`` to ``image_cv`` as an extra channel.

    Both inputs are passed through ``normalize_image`` before being joined
    along the channel axis.  The elapsed time is printed.

    Raises:
        ValueError: If the two inputs differ in height or width.

    Returns:
        The combined array (colour channels followed by the depth channel).
    """
    print("Merging channels to create a 4-channel image...")
    started = time.time()
    colour = np.array(image_cv)

    # Height and width must agree before the planes can be stacked.
    if colour.shape[:2] != depth_map.shape[:2]:
        raise ValueError("RGB image and depth map must have the same dimensions")

    colour = normalize_image(colour)
    depth = normalize_image(depth_map)

    # Give the depth plane a trailing channel axis, then join on that axis.
    depth_plane = depth[:, :, np.newaxis]
    four_channel_image = np.concatenate([colour, depth_plane], axis=2)

    elapsed = time.time() - started
    print(f"Merging channels took {elapsed:.2f} seconds")
    return four_channel_image

def save_four_channel_image(four_channel_image, original_filename, save_directory, timestamp):
    """Write the merged image to ``<timestamp>_<base>_four_channel.png`` in ``save_directory``."""
    print("Saving the 4-channel image...")
    stem, _extension = os.path.splitext(original_filename)
    save_path = os.path.join(save_directory, f'{timestamp}_{stem}_four_channel.png')
    cv2.imwrite(save_path, four_channel_image)
    print(f"4-channel image saved to {save_path}")

def main(image_path, save_directory, display=True):
    """Extract, save and optionally preview every layer of one photo.

    Steps: extract the two embedded MP images with exiftool, load and orient
    the main image, derive the colour channels and the greyscale image,
    resize the depth map (MPImage3) to ``TARGET_SIZE``, save all layers as
    PNG files and finally save the merged 4-channel image.  Processing stops
    early, after printing an error, when the MP images cannot be extracted
    or loaded.

    Args:
        image_path: Path of the source image.
        save_directory: Folder that receives every generated file.
        display: When True (the default) the layers are also shown with
            matplotlib; pass False to skip the preview.
    """
    total_start_time = time.time()
    print(f"Processing image: {image_path}")

    # Timestamp used as the prefix of every output filename
    timestamp = time.strftime("%Y%m%d%H%M%S")

    file_name = os.path.basename(image_path)
    base_name = os.path.splitext(file_name)[0]

    # Step 1: Extract MP images using exiftool
    image2_path, image3_path = run_exiftool(image_path, save_directory, timestamp, base_name)
    if not image2_path or not image3_path:
        print("Error: Could not extract MP images.")
        return

    # Step 2: Load and process the main image
    image = load_image(image_path)

    # Read the orientation from the freshly loaded image, defaulting to 1
    # (no rotation) when the file carries no EXIF block, so that such files
    # do not raise KeyError here.
    orientation = 1
    raw_exif = image.info.get('exif')
    if raw_exif:
        orientation = piexif.load(raw_exif)['0th'].get(piexif.ImageIFD.Orientation, 1)

    image = check_orientation(image)
    image_cv = convert_to_opencv(image)

    # Step 3: Load image2 and image3
    image2 = cv2.imread(image2_path, cv2.IMREAD_GRAYSCALE)
    image3 = cv2.imread(image3_path, cv2.IMREAD_GRAYSCALE)

    if image2 is None or image3 is None:
        print("Error: Could not load one or both additional images.")
        return

    # Apply the main image's orientation to image2 and image3
    image2 = correct_orientation_opencv(image2, orientation)
    image3 = correct_orientation_opencv(image3, orientation)

    # Normalize image3 (depth map)
    image3_normalized = cv2.normalize(image3, None, 0, 255, cv2.NORM_MINMAX)

    blue_channel, green_channel, red_channel = extract_rgb_channels(image_cv)

    # Convert the original image to grayscale
    grayscale_image = cv2.cvtColor(image_cv, cv2.COLOR_BGR2GRAY)

    # TARGET_SIZE is ordered (width, height) for cv2.resize, whereas .shape
    # is (rows, cols); flip it so the comparison is like for like.
    if image3.shape[:2] != TARGET_SIZE[::-1]:
        print(f"Depth map size {image3.shape} does not match target size {TARGET_SIZE}. Upscaling required.")
        upscaled_image3 = upscale_image(image3, TARGET_SIZE)
    else:
        upscaled_image3 = image3

    # Print shapes for debugging
    print(f"Original image shape: {image_cv.shape}")
    print(f"Image 2 shape: {image2.shape}")
    print(f"Image 3 shape: {image3.shape}")
    print(f"Normalized Image 3 shape: {image3_normalized.shape}")
    print(f"Blue channel shape: {blue_channel.shape}")
    print(f"Green channel shape: {green_channel.shape}")
    print(f"Red channel shape: {red_channel.shape}")
    print(f"Grayscale image shape: {grayscale_image.shape}")
    print(f"Upscaled Image 3 shape: {upscaled_image3.shape}")

    # Report all image sizes without raising an error
    report_image_sizes([image_cv, image2, image3, blue_channel, green_channel, red_channel, grayscale_image, upscaled_image3])

    save_images(image_cv, image2, image3_normalized, blue_channel, green_channel, red_channel, grayscale_image, upscaled_image3, file_name, save_directory, timestamp)
    if display:
        display_images(image_cv, image2, image3_normalized, blue_channel, green_channel, red_channel, grayscale_image, upscaled_image3)

    # Create and save the 4-channel image
    four_channel_image = merge_channels_to_4_channel(image_cv, upscaled_image3)
    save_four_channel_image(four_channel_image, file_name, save_directory, timestamp)

    total_end_time = time.time()
    print(f"Total processing time: {total_end_time - total_start_time:.2f} seconds")

def process_all_images_in_folder(folder_path, save_directory):
    """Run ``main`` on every ``.png``/``.jpg``/``.jpeg`` entry of ``folder_path``.

    The extension test ignores letter case; subfolders are not searched.
    """
    image_suffixes = ('.png', '.jpg', '.jpeg')
    for entry in os.listdir(folder_path):
        if not entry.lower().endswith(image_suffixes):
            continue
        main(os.path.join(folder_path, entry), save_directory)

# Process every labelled image when the cell/script is executed directly
if __name__ == "__main__":
    process_all_images_in_folder(
        folder_path=source_directory,
        save_directory=destination_directory,
    )

Processing image: C:\Users\ReCas\OneDrive\Documents\2024_AIMachineLearning\99_Projects\02.Labelled_jpg\11530_IMG_0666_509_500USA_09CAR_SEA_CARRIERXXX_2RGT_E_ORIG.JPG
Extracting MP images using exiftool...
MPImage2 saved to C:\Users\ReCas\OneDrive\Documents\2024_AIMachineLearning\99_Projects\03.ExtractedLayers\20240730000050_11530_IMG_0666_509_500USA_09CAR_SEA_CARRIERXXX_2RGT_E_ORIG_mp_image2.png
MPImage3 saved to C:\Users\ReCas\OneDrive\Documents\2024_AIMachineLearning\99_Projects\03.ExtractedLayers\20240730000050_11530_IMG_0666_509_500USA_09CAR_SEA_CARRIERXXX_2RGT_E_ORIG_mp_image3.png
Loading image...
Loading image took 0.00 seconds
Checking and correcting image orientation...
Orientation check took 0.20 seconds
Converting image to OpenCV format...
Conversion took 0.06 seconds
Extracting RGB channels...
RGB extraction took 0.03 seconds
Depth map size (768, 576) does not match target size (4284, 5712). Upscaling required.
Upscaling image...
Upscaling took 0.01 seconds
Original image shape: (5712, 4284, 3)
Image 2 shape: (2856, 2142)
Image 3 shape: (768, 576)
Normalized Image 3 shape: (768, 576)
Blue channel shape: (5712, 4284)
Green channel shape: (5712, 4284)
Red channel shape: (5712, 4284)
Grayscale image shape: (5712, 4284)
Upscaled Image 3 shape: (5712, 4284)
Image 1 has shape (2856, 2142), expected (5712, 4284)
Image 2 has shape (768, 576), expected (5712, 4284)
Saving images...
Saving images took 1.47 seconds
Displaying images...

Displaying images took 5.48 seconds
Merging channels to create a 4-channel image...
Merging channels took 0.14 seconds
Saving the 4-channel image...
4-channel image saved to C:\Users\ReCas\OneDrive\Documents\2024_AIMachineLearning\99_Projects\03.ExtractedLayers\20240730000050_11530_IMG_0666_509_500USA_09CAR_SEA_CARRIERXXX_2RGT_E_ORIG_four_channel.png
Total processing time: 8.75 seconds

# Part 5 : Resize (Crop) files and Select by Channel

This processes only greyscale, original and four_channel files.

All the photos were taken in the inner 1/9th frame in the camera.  It is therefore possible to crop every file into the middle part.  (imagine 2 vertical and 2 horizontal lines at 1/3 and 2/3 along each side framing each piece in the middle)


TIMING

This can take about 2 hrs 10 min (single-threaded; see the logged run below)


LOCALISATION

source_dir     where the extracted layers are

dest_dir       where the resized or cropped files will go.

# WARNING this can take 7806 seconds i.e.  2hr 10

In [None]:
import os
from PIL import Image
import time
from pathlib import Path

# LOCALISATION: extracted layers (input) and cropped files (output)
source_dir = Path(r"C:\Users\ReCas\OneDrive\Documents\2024_AIMachineLearning\99_Projects\03.ExtractedLayers")
dest_dir = Path(r"C:\Users\ReCas\OneDrive\Documents\2024_AIMachineLearning\99_Projects\04.ReSized")

# Make sure the output folder exists, then empty it so every run starts
# clean (only files directly inside it are removed, not subfolders)
dest_dir.mkdir(parents=True, exist_ok=True)
for stale in [entry for entry in dest_dir.glob('*') if entry.is_file()]:
    stale.unlink()

# Filename ending of each layer to process -> prefix given to the cropped file
file_endings = {
    "grayscale.png": "gs_",
    "original.png": "or_",
    "four_channel.png": "fc_",
}

# Function to process an image
def process_image(file_path, save_path):
    """Crop the image at ``file_path`` to its centre third and save it to ``save_path``.

    The crop box runs from one third to two thirds of the width and of the
    height, both computed with integer division.
    """
    with Image.open(file_path) as source:
        width, height = source.size
        third_w, third_h = width // 3, height // 3
        # (left, top, right, bottom)
        box = (third_w, third_h, third_w * 2, third_h * 2)
        source.crop(box).save(save_path)

# Function to get new filename
def get_new_filename(file_path, prefix, suffix="9th"):
    """Return ``<prefix><stem>_<suffix><extension>`` for the path ``file_path``."""
    stem, extension = file_path.stem, file_path.suffix
    return "{}{}_{}{}".format(prefix, stem, suffix, extension)

# Start overall timing
overall_start_time = time.time()

# Accumulated processing seconds for each file ending
timings = dict.fromkeys(file_endings, 0)

# Count the source files that carry one of the wanted endings
wanted_endings = tuple(file_endings)
file_count_before = 0
for candidate in source_dir.rglob('*'):
    if candidate.is_file() and candidate.name.endswith(wanted_endings):
        file_count_before += 1

# Crop every matching file into the destination folder
for file in source_dir.rglob('*'):
    if not file.is_file():
        continue
    for ending, prefix in file_endings.items():
        if not file.name.endswith(ending):
            continue
        new_file_path = dest_dir / get_new_filename(file, prefix)

        start_time = time.time()
        process_image(file, new_file_path)
        timings[ending] += time.time() - start_time

# Count what actually landed in the destination folder
file_count_after = sum(1 for _ in filter(Path.is_file, dest_dir.rglob('*')))

# End overall timing
overall_elapsed_time = time.time() - overall_start_time

# Output results
print(f"Overall time taken: {overall_elapsed_time:.2f} seconds")
print(f"Files before processing: {file_count_before}")
print(f"Files after processing: {file_count_after}")

# Output timing for each file type
for ending, time_taken in timings.items():
    print(f"Time taken for files ending with '{ending}': {time_taken:.2f} seconds")

# The run is considered complete when both counts agree
if file_count_before != file_count_after:
    print("Some files may not have been processed correctly.")
else:
    print("All files were successfully processed and transferred.")


Overall time taken: 7806.32 seconds
Files before processing: 4590
Files after processing: 4590
Time taken for files ending with 'grayscale.png': 4177.89 seconds
Time taken for files ending with 'original.png': 1684.26 seconds
Time taken for files ending with 'four_channel.png': 1899.27 seconds
All files were successfully processed and transferred.

# Part 6 : Generating classification Folders

This moves the relevant files into folders related to their data channel and then their classification type, with the files being placed in subfolders of the classification
e.g.
      06.DataSets_gy         >   country            >  100RUS
      image type=greyscale       classification        named class

There are 3 image types:
      greyscale
      original
      four-channel

There are 4 classifications:
      country            what 'country' is the piece?  This can be determined by the colour or sculpt.  (5 classes)
      force              is the type of the piece land, air or sea?  (3 classes)
      piece              which of the 9 piece types is it? (9 classes)
      exact_piece        what is the country - piece combination (45 classes)


LOCALISATION (in each of the subparts)

src_folder = folder where the cropped data went
dst_folders = {
    'fc': folder for four-channel
    'gs': folder for greyscale
    'or': folder for original
}



# Part 6.1 : split country classification

In [None]:
import os
import shutil
from collections import defaultdict
import logging

# Log every copy to file_copy.log (timestamp - level - message)
logging.basicConfig(filename='file_copy.log', level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

# LOCALISATION: cropped files (input) and one dataset folder per image type
src_folder = r'C:\Users\ReCas\OneDrive\Documents\2024_AIMachineLearning\99_Projects\04.ReSized'
dst_folders = {
    'fc': r'C:\Users\ReCas\OneDrive\Documents\2024_AIMachineLearning\99_Projects\08.DataSets_fc',
    'gs': r'C:\Users\ReCas\OneDrive\Documents\2024_AIMachineLearning\99_Projects\06.DataSets_gy',
    'or': r'C:\Users\ReCas\OneDrive\Documents\2024_AIMachineLearning\99_Projects\07.DataSets_or'
}

# Country codes (one class folder each)
country_codes = ['100RUS', '200GER', '300UK', '400JAP', '500USA']

# Create <dataset>/country/<code> for every dataset folder
for dst_root in dst_folders.values():
    country_root = os.path.join(dst_root, 'country')
    os.makedirs(country_root, exist_ok=True)
    for country in country_codes:
        os.makedirs(os.path.join(country_root, country), exist_ok=True)

# Function to copy files to their respective directories
def copy_files(source_dir=None, destinations=None, codes=None, expected_count=1530):
    """Copy each cropped PNG into ``<dataset>/country/<code>/``.

    The dataset is chosen from the filename prefix (text before the first
    ``_``) and the country code is the seventh ``_``-separated field.  Files
    with an unknown prefix, too few fields or an unknown code are skipped.
    Copy failures are logged and do not stop the run.

    Args:
        source_dir: Folder to read from; defaults to ``src_folder``.
        destinations: Mapping of prefix to dataset folder; defaults to
            ``dst_folders``.
        codes: Accepted country codes; defaults to ``country_codes``.
        expected_count: Number of files expected per prefix.

    Returns:
        dict: Number of files copied per prefix, including prefixes for
        which nothing was copied.
    """
    source_dir = src_folder if source_dir is None else source_dir
    destinations = dst_folders if destinations is None else destinations
    codes = country_codes if codes is None else codes

    # Start every known prefix at zero so that a prefix with no copied files
    # is still reported as a mismatch during verification.
    count_files = dict.fromkeys(destinations, 0)

    for file_name in os.listdir(source_dir):
        if not file_name.endswith('.png'):
            continue
        parts = file_name.split('_')
        prefix = parts[0]
        if prefix not in destinations or len(parts) <= 6:
            continue
        country_code = parts[6]
        if country_code not in codes:
            continue

        dst_folder = os.path.join(destinations[prefix], 'country', country_code)
        src_file = os.path.join(source_dir, file_name)
        dst_file = os.path.join(dst_folder, file_name)
        try:
            os.makedirs(dst_folder, exist_ok=True)
            shutil.copy2(src_file, dst_file)
        except OSError as e:
            logging.error(f"Error copying {src_file} to {dst_file}: {e}")
        else:
            logging.info(f"Copied {src_file} to {dst_file}")
            count_files[prefix] += 1

    # Verify the counts
    for prefix, count in count_files.items():
        if count != expected_count:
            logging.warning(f"{prefix} files count mismatch. Expected {expected_count}, found {count}.")
        else:
            logging.info(f"All {prefix} files copied successfully: {count} files.")

    return count_files

# Run the country split; any unexpected failure is logged as critical
if __name__ == "__main__":
    try:
        copy_files()
    except Exception as e:
        logging.critical(f"An unexpected error occurred: {e}")
    else:
        logging.info("File copying completed successfully.")


# Part 6.2 : Split into Force classification

In [None]:
import os
import shutil
from collections import defaultdict
import logging
import time

# Log every copy to file_copy.log (timestamp - level - message)
logging.basicConfig(filename='file_copy.log', level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

# LOCALISATION: cropped files (input) and one dataset folder per image type
src_folder = r'C:\Users\ReCas\OneDrive\Documents\2024_AIMachineLearning\99_Projects\04.ReSized'
dst_folders = {
    'fc': r'C:\Users\ReCas\OneDrive\Documents\2024_AIMachineLearning\99_Projects\08.DataSets_fc',
    'gs': r'C:\Users\ReCas\OneDrive\Documents\2024_AIMachineLearning\99_Projects\06.DataSets_gy',
    'or': r'C:\Users\ReCas\OneDrive\Documents\2024_AIMachineLearning\99_Projects\07.DataSets_or'
}

# Force classifications (one class folder each)
force_classifications = ['LND', 'SEA', 'AIR']

# Create <dataset>/force/<class> for every dataset folder
for dst_root in dst_folders.values():
    force_root = os.path.join(dst_root, 'force')
    os.makedirs(force_root, exist_ok=True)
    for force_name in force_classifications:
        os.makedirs(os.path.join(force_root, force_name), exist_ok=True)

# Function to copy files to their respective directories
def copy_files(src=None, destinations=None, classifications=None, expected_count=1530):
    """Copy resized PNG images into per-force class folders and verify the totals.

    A file is copied when its name ends in ``.png``, its first
    underscore-separated field is a key of ``destinations`` and its ninth
    field (index 8) is one of ``classifications``.  It is placed in
    ``<destinations[prefix]>/force/<force code>/`` under its original name.

    Args:
        src: Folder to scan (not recursive).  Defaults to the module-level
            ``src_folder``.
        destinations: Mapping of file-name prefix to data-set root folder.
            Defaults to the module-level ``dst_folders``.
        classifications: Accepted force codes.  Defaults to the module-level
            ``force_classifications``.
        expected_count: Number of files each prefix is expected to contribute.

    Returns:
        dict mapping every prefix in ``destinations`` to the number of files
        copied for it (0 when nothing matched).
    """
    # Fall back to the cell-level configuration when called without arguments
    src = src_folder if src is None else src
    destinations = dst_folders if destinations is None else destinations
    if classifications is None:
        classifications = force_classifications

    # Start the timer
    start_time = time.time()

    valid_classes = set(classifications)
    # Seed every known prefix with 0 so a prefix that copies nothing is still
    # checked (and reported) by the verification step below.
    count_files = dict.fromkeys(destinations, 0)
    failed = 0

    for file_name in os.listdir(src):
        if not file_name.endswith('.png'):
            continue
        parts = file_name.split('_')
        prefix = parts[0]
        if prefix not in destinations or len(parts) <= 8:
            continue
        force_class = parts[8]
        if force_class not in valid_classes:
            continue

        dst_folder = os.path.join(destinations[prefix], 'force', force_class)
        src_file = os.path.join(src, file_name)
        dst_file = os.path.join(dst_folder, file_name)
        try:
            # Create the class folder on demand so the function does not rely
            # on the folders having been prepared beforehand.
            os.makedirs(dst_folder, exist_ok=True)
            shutil.copy2(src_file, dst_file)
        except OSError as e:
            # Best effort: record the failure and carry on with the next file
            failed += 1
            logging.error(f"Error copying {src_file} to {dst_file}: {e}")
        else:
            logging.info(f"Copied {src_file} to {dst_file}")
            count_files[prefix] += 1

    # Verify the counts
    for prefix, count in count_files.items():
        if count != expected_count:
            logging.warning(f"{prefix} files count mismatch. Expected {expected_count}, found {count}.")
        else:
            logging.info(f"All {prefix} files copied successfully: {count} files.")

    # Stop the timer and report; only claim success when no copy failed
    elapsed_time = time.time() - start_time
    if failed:
        summary = f"File copying finished with {failed} error(s) in {elapsed_time:.2f} seconds."
        logging.warning(summary)
    else:
        summary = f"File copying completed successfully in {elapsed_time:.2f} seconds."
        logging.info(summary)
    print(summary)

    return count_files

if __name__ == "__main__":
    # Top-level boundary: anything copy_files() lets escape is recorded in the
    # log instead of stopping the run.
    try:
        copy_files()
    except Exception as e:
        # exc_info=True appends the traceback so the failure can be located
        # from the log alone.
        logging.critical(f"An unexpected error occurred: {e}", exc_info=True)
    else:
        # Only reached when copy_files() returned without raising.
        logging.info("File copying completed successfully.")


File copying completed successfully in 12.31 seconds.

# Part 6.3 : Split into piece classification

In [None]:
import os
import shutil
from collections import defaultdict
import logging
import time

# Route log records to file_copy.log, each prefixed with time and level
logging.basicConfig(
    filename='file_copy.log',
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
)

# LOCALISATION: folder holding the resized images, plus one data-set root per
# file-name prefix.
# NOTE(review): the notebook introduction names the greyscale prefix 'gy_',
# while the key below is 'gs' - confirm which prefix the resized files carry.
src_folder = r'C:\Users\ReCas\OneDrive\Documents\2024_AIMachineLearning\99_Projects\04.ReSized'
dst_folders = {
    'fc': r'C:\Users\ReCas\OneDrive\Documents\2024_AIMachineLearning\99_Projects\08.DataSets_fc',
    'gs': r'C:\Users\ReCas\OneDrive\Documents\2024_AIMachineLearning\99_Projects\06.DataSets_gy',
    'or': r'C:\Users\ReCas\OneDrive\Documents\2024_AIMachineLearning\99_Projects\07.DataSets_or',
}

# Accepted piece codes; each one gets its own class folder
piece_classifications = [
    '01INF', '02TNK', '03FGT',
    '04BMB', '05DES', '06TRS',
    '07SUB', '08BAT', '09CAR',
]

# Ensure <data-set root>/piece/<piece code> exists for every data set
for target_root in dst_folders.values():
    piece_root = os.path.join(target_root, 'piece')
    os.makedirs(piece_root, exist_ok=True)
    for piece_code in piece_classifications:
        os.makedirs(os.path.join(piece_root, piece_code), exist_ok=True)

# Function to copy files to their respective directories
def copy_files(src=None, destinations=None, classifications=None, expected_count=1530):
    """Copy resized PNG images into per-piece class folders and verify the totals.

    A file is copied when its name ends in ``.png``, its first
    underscore-separated field is a key of ``destinations`` and its eighth
    field (index 7) is one of ``classifications``.  It is placed in
    ``<destinations[prefix]>/piece/<piece code>/`` under its original name.

    Args:
        src: Folder to scan (not recursive).  Defaults to the module-level
            ``src_folder``.
        destinations: Mapping of file-name prefix to data-set root folder.
            Defaults to the module-level ``dst_folders``.
        classifications: Accepted piece codes.  Defaults to the module-level
            ``piece_classifications``.
        expected_count: Number of files each prefix is expected to contribute
            in total, across all piece classes.

    Returns:
        dict mapping every prefix in ``destinations`` to the number of files
        copied for it (0 when nothing matched).
    """
    # Fall back to the cell-level configuration when called without arguments
    src = src_folder if src is None else src
    destinations = dst_folders if destinations is None else destinations
    if classifications is None:
        classifications = piece_classifications

    # Start the timer
    start_time = time.time()

    valid_classes = set(classifications)
    # Seed every known prefix with 0 so a prefix that copies nothing is still
    # checked (and reported) by the verification step below.
    count_files = dict.fromkeys(destinations, 0)
    failed = 0

    for file_name in os.listdir(src):
        if not file_name.endswith('.png'):
            continue
        parts = file_name.split('_')
        prefix = parts[0]
        if prefix not in destinations or len(parts) <= 7:
            continue
        piece_class = parts[7]
        if piece_class not in valid_classes:
            continue

        dst_folder = os.path.join(destinations[prefix], 'piece', piece_class)
        src_file = os.path.join(src, file_name)
        dst_file = os.path.join(dst_folder, file_name)
        try:
            # Create the class folder on demand so the function does not rely
            # on the folders having been prepared beforehand.
            os.makedirs(dst_folder, exist_ok=True)
            shutil.copy2(src_file, dst_file)
        except OSError as e:
            # Best effort: record the failure and carry on with the next file
            failed += 1
            logging.error(f"Error copying {src_file} to {dst_file}: {e}")
        else:
            logging.info(f"Copied {src_file} to {dst_file}")
            count_files[prefix] += 1

    # Verify the counts
    for prefix, count in count_files.items():
        if count != expected_count:
            logging.warning(f"{prefix} files count mismatch. Expected {expected_count}, found {count}.")
        else:
            logging.info(f"All {prefix} files copied successfully: {count} files.")

    # Stop the timer and report; only claim success when no copy failed
    elapsed_time = time.time() - start_time
    if failed:
        summary = f"File copying finished with {failed} error(s) in {elapsed_time:.2f} seconds."
        logging.warning(summary)
    else:
        summary = f"File copying completed successfully in {elapsed_time:.2f} seconds."
        logging.info(summary)
    print(summary)

    return count_files

if __name__ == "__main__":
    # Top-level boundary: anything copy_files() lets escape is recorded in the
    # log instead of stopping the run.
    try:
        copy_files()
    except Exception as e:
        # exc_info=True appends the traceback so the failure can be located
        # from the log alone.
        logging.critical(f"An unexpected error occurred: {e}", exc_info=True)
    else:
        # Only reached when copy_files() returned without raising.
        logging.info("File copying completed successfully.")


File copying completed successfully in 12.48 seconds.

# Part 6.4 : Split into exact piece classification (piece and country)

In [None]:
import os
import shutil
from collections import defaultdict
import logging
import time

# Route log records to file_copy.log, each prefixed with time and level
logging.basicConfig(
    filename='file_copy.log',
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
)

# LOCALISATION: folder holding the resized images, plus one data-set root per
# file-name prefix.
# NOTE(review): the notebook introduction names the greyscale prefix 'gy_',
# while the key below is 'gs' - confirm which prefix the resized files carry.
src_folder = r'C:\Users\ReCas\OneDrive\Documents\2024_AIMachineLearning\99_Projects\04.ReSized'
dst_folders = {
    'fc': r'C:\Users\ReCas\OneDrive\Documents\2024_AIMachineLearning\99_Projects\08.DataSets_fc',
    'gs': r'C:\Users\ReCas\OneDrive\Documents\2024_AIMachineLearning\99_Projects\06.DataSets_gy',
    'or': r'C:\Users\ReCas\OneDrive\Documents\2024_AIMachineLearning\99_Projects\07.DataSets_or',
}

# Exact piece classifications: every country paired with every piece, listed
# country by country.  Each label is '<id>_<country>_<piece>', where <id> is
# the country's leading digit followed by the piece's two-digit number
# (e.g. '101_100RUS_01INF', '309_300UK_09CAR').
_country_codes = ['100RUS', '200GER', '300UK', '400JAP', '500USA']
_piece_codes = ['01INF', '02TNK', '03FGT', '04BMB', '05DES', '06TRS', '07SUB', '08BAT', '09CAR']
exact_piece_classifications = [
    f"{country[0]}{piece[:2]}_{country}_{piece}"
    for country in _country_codes
    for piece in _piece_codes
]

# Ensure <data-set root>/exact_piece/<label> exists for every data set
for target_root in dst_folders.values():
    exact_root = os.path.join(target_root, 'exact_piece')
    os.makedirs(exact_root, exist_ok=True)
    for label in exact_piece_classifications:
        os.makedirs(os.path.join(exact_root, label), exist_ok=True)

# Function to copy files to their respective directories
def copy_files(src=None, destinations=None, classifications=None, expected_count=34):
    """Copy resized PNG images into per-exact-piece folders and verify each folder.

    A file is copied when its name ends in ``.png``, its first
    underscore-separated field is a key of ``destinations`` and its sixth to
    eighth fields (indices 5-7), re-joined with ``_``, form one of
    ``classifications``.  It is placed in
    ``<destinations[prefix]>/exact_piece/<label>/`` under its original name.

    After copying, every ``<prefix>/exact_piece/<label>`` folder is listed and
    its entry count compared with ``expected_count``; a missing folder counts
    as holding 0 entries.

    Args:
        src: Folder to scan (not recursive).  Defaults to the module-level
            ``src_folder``.
        destinations: Mapping of file-name prefix to data-set root folder.
            Defaults to the module-level ``dst_folders``.
        classifications: Accepted exact-piece labels.  Defaults to the
            module-level ``exact_piece_classifications``.
        expected_count: Number of entries each class folder should hold.

    Returns:
        dict mapping every prefix in ``destinations`` to the number of files
        copied for it during this call (0 when nothing matched).
    """
    # Fall back to the cell-level configuration when called without arguments
    src = src_folder if src is None else src
    destinations = dst_folders if destinations is None else destinations
    if classifications is None:
        classifications = exact_piece_classifications

    # Start the timer
    start_time = time.time()

    valid_classes = set(classifications)
    count_files = dict.fromkeys(destinations, 0)
    failed = 0

    for file_name in os.listdir(src):
        if not file_name.endswith('.png'):
            continue
        parts = file_name.split('_')
        prefix = parts[0]
        # Indices 5, 6 and 7 are all needed to build the label
        if prefix not in destinations or len(parts) <= 7:
            continue
        exact_piece_class = '_'.join(parts[5:8])
        if exact_piece_class not in valid_classes:
            continue

        dst_folder = os.path.join(destinations[prefix], 'exact_piece', exact_piece_class)
        src_file = os.path.join(src, file_name)
        dst_file = os.path.join(dst_folder, file_name)
        try:
            # Create the class folder on demand so the function does not rely
            # on the folders having been prepared beforehand.
            os.makedirs(dst_folder, exist_ok=True)
            shutil.copy2(src_file, dst_file)
        except OSError as e:
            # Best effort: record the failure and carry on with the next file
            failed += 1
            logging.error(f"Error copying {src_file} to {dst_file}: {e}")
        else:
            logging.info(f"Copied {src_file} to {dst_file}")
            count_files[prefix] += 1

    # Verify the counts folder by folder.  This counts everything present in
    # the folder, including entries left there by earlier runs.
    for prefix, root in destinations.items():
        for classification in classifications:
            classification_folder = os.path.join(root, 'exact_piece', classification)
            try:
                actual_count = len(os.listdir(classification_folder))
            except FileNotFoundError:
                actual_count = 0
            if actual_count != expected_count:
                logging.warning(f"{prefix} {classification} files count mismatch. Expected {expected_count}, found {actual_count}.")
            else:
                logging.info(f"All {prefix} {classification} files copied successfully: {actual_count} files.")

    # Stop the timer and report; only claim success when no copy failed
    elapsed_time = time.time() - start_time
    if failed:
        summary = f"File copying finished with {failed} error(s) in {elapsed_time:.2f} seconds."
        logging.warning(summary)
    else:
        summary = f"File copying completed successfully in {elapsed_time:.2f} seconds."
        logging.info(summary)
    print(summary)

    return count_files

if __name__ == "__main__":
    # Top-level boundary: anything copy_files() lets escape is recorded in the
    # log instead of stopping the run.
    try:
        copy_files()
    except Exception as e:
        # exc_info=True appends the traceback so the failure can be located
        # from the log alone.
        logging.critical(f"An unexpected error occurred: {e}", exc_info=True)
    else:
        # Only reached when copy_files() returned without raising.
        logging.info("File copying completed successfully.")


File copying completed successfully in 14.04 seconds.