# Image Classification Dataset preparation Operations

### 1. Load the Data Directory

- Ensure that your data directory contains subfolders named according to each class of images. For example:

  ```
  data_directory/
  ├── class1/
  ├── class2/
  └── class3/
  ```

### 2. Convert all .png Files to .jpg Files

- Use a script to iterate through each class folder and convert all `.png` images to `.jpg` format.

### 3. Convert all .webp Files to .jpg Files

- Use a similar script to convert `.webp` images to `.jpg` format.

### 4. Convert .jpeg files to .jpg files

- Use a similar script to convert `.jpeg` images to `.jpg` format.


### 5. Resize all images to ensure consistency

- Use the resize function to scale all images to the same dimensions.


### 6. Create Extra Images (Rotated and Black/White)

- Augment the dataset by creating rotated (90, 180, 270 degrees) and black-and-white versions of each image.

### 7. Rename All Images in a Directory

- Ensure that all images have unique and consistent filenames.

### 8. Verify the .jpg Images in a Directory

- Check all images to ensure they are in `.jpg` format and valid.

### 9. Remove Any Truncated Images in a Directory

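- Find any images that are truncated or cannot be opened so that they can be removed (the function provided below only lists them; deleting the listed files is a separate step).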

### 10. Split Your Data into Training and Validation Directories

- Split the dataset into training and validation sets.

## Find all sub-Directories in a directory

In [None]:
import os

def find_subdirectories(directory):
    """
    Collect the paths of the immediate subdirectories of a directory.

    Parameters:
    - directory: The path to the main directory.

    Returns:
    - A list with one joined path per immediate subdirectory. When the
      directory contains no subdirectories, a one-element list holding the
      passed-in directory itself.
    """
    found = []
    for entry in os.listdir(directory):
        candidate = os.path.join(directory, entry)
        if os.path.isdir(candidate):
            found.append(candidate)

    # Fall back to the directory itself so the result is never empty.
    return found if found else [directory]

# Example usage: list the class folders under the data directory.
# `main_directory` and `subdirectories` are module-level names; the conversion,
# augmentation, renaming and verification loops further down reuse them.
main_directory = './images'
subdirectories = sorted(find_subdirectories(main_directory))
print(subdirectories)


## Convert all .png files to .jpg

In [None]:
import os
from PIL import Image

def convert_png_to_jpg(directory):
    """
    Convert every PNG image in a directory to JPG and delete the PNG.

    Files are matched by a case-insensitive ``.png`` extension. Each match is
    re-encoded as an RGB JPEG saved in the same directory under the same base
    name with a ``.jpg`` extension, after which the source PNG is removed.
    An existing file with the target name is overwritten.

    Parameters:
        directory (str): The path to the directory containing PNG images.

    Returns:
        None
    """
    suffix = ".png"
    for file in os.listdir(directory):
        if not file.lower().endswith(suffix):
            continue
        png_path = os.path.join(directory, file)
        # Swap only the trailing extension; str.replace would also rewrite any
        # ".png" that occurs earlier in the file name.
        jpg_path = os.path.join(directory, file[:-len(suffix)] + ".jpg")
        # The context manager closes the source file before it is deleted.
        with Image.open(png_path) as img:
            # JPEG has no alpha channel, so flatten to RGB before saving.
            img.convert("RGB").save(jpg_path, "JPEG")
        os.remove(png_path)

# Example usage on a single folder:
#     convert_png_to_jpg('./images2/test')

# Convert the PNG files of every class folder found earlier in `subdirectories`.
for i in subdirectories:
    print("converting files in :" + i)
    convert_png_to_jpg(i)



## Loop through a dir and convert .webp to .jpg

In [None]:
import os
from PIL import Image

def convert_webp_to_jpg(folder_path):
    """
    Convert all .webp images in the specified folder to .jpg format.

    Files are matched by a case-insensitive ``.webp`` extension. Each match is
    re-encoded as an RGB JPEG saved in the same folder under the same base
    name with a ``.jpg`` extension, after which the source file is removed.
    An existing file with the target name is overwritten.

    Parameters:
    - folder_path: The path to the folder containing .webp images.

    Returns:
    - None
    """
    suffix = ".webp"
    for file_name in os.listdir(folder_path):
        if not file_name.lower().endswith(suffix):
            continue
        file_path = os.path.join(folder_path, file_name)
        # Build the target from the file name only; replacing ".webp" in the
        # full path would also rewrite a folder name that contains it.
        jpg_file_path = os.path.join(folder_path, file_name[:-len(suffix)] + ".jpg")
        with Image.open(file_path) as image:
            # Always convert: JPEG cannot store alpha or palette modes.
            image.convert("RGB").save(jpg_file_path, "JPEG")
        # Delete only after the source handle has been closed.
        os.remove(file_path)

# Example usage on a single folder:
#     convert_webp_to_jpg('./images2/test')
# Convert the WebP files of every class folder found earlier in `subdirectories`.
for i in subdirectories:
    print("converting files in :"+i)
    convert_webp_to_jpg(i)


## Loop through a dir and convert jpeg file extensions to .jpg 

In [None]:
import os
from PIL import Image

def convert_jpeg_to_jpg(directory):
    """
    Re-save every ``.jpeg`` file in a directory under a ``.jpg`` extension.

    The extension check is case-insensitive. Each matching file is opened,
    converted to RGB, written as a JPEG named ``<base name>.jpg`` in the same
    directory, and the original ``.jpeg`` file is then deleted.

    Parameters:
        directory (str): The path to the directory containing JPEG images.

    Returns:
        None
    """
    for entry in os.listdir(directory):
        if not entry.lower().endswith(".jpeg"):
            continue

        source_path = os.path.join(directory, entry)
        with Image.open(source_path) as img:
            base_name = os.path.splitext(entry)[0]
            target_path = os.path.join(directory, base_name + ".jpg")
            rgb_image = img.convert("RGB")
            rgb_image.save(target_path, "JPEG")

        # The source is removed only after its handle has been released.
        os.remove(source_path)

# Example usage on a single folder:
#     convert_jpeg_to_jpg('./test')
# Re-save the .jpeg files of every class folder found earlier in `subdirectories`.
for i in subdirectories:
    print("converting files in :" + i)
    convert_jpeg_to_jpg(i)


## Ensure all training images are the same size

In [None]:
import os
from PIL import Image
import tqdm

def resize_images_in_subdirectories(directory, size=(48, 48)):
    """
    Resize, in place, every image found one level below the given directory.

    Each immediate subdirectory is printed and then walked with a tqdm
    progress bar. Every regular file in it is opened, resized with the
    LANCZOS filter and saved back to the same path. Files that raise an
    IOError are reported and skipped. Files sitting directly in ``directory``
    are not touched.

    Parameters:
    - directory: The path to the main directory containing subdirectories.
    - size: The target size of the resized images. Default is (48, 48).
    """
    for entry in os.listdir(directory):
        class_dir = os.path.join(directory, entry)
        if not os.path.isdir(class_dir):
            continue

        print(class_dir)
        for name in tqdm.tqdm(os.listdir(class_dir)):
            target = os.path.join(class_dir, name)
            if not os.path.isfile(target):
                continue
            try:
                with Image.open(target) as img:
                    resized = img.resize(size, Image.LANCZOS)
                    resized.save(target)
            except IOError:
                print(f'Cannot open or process the file: {target}')

# Resize every image under the data directory to the default 48x48.
# The top-level `main_directory` (defined earlier as './images') is passed
# rather than the `subdirectories` list, because the function walks the class
# folders itself.
resize_images_in_subdirectories(main_directory)

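## Create rotated and black/white images of each image in a directory
Each source image becomes 8 images (1 × 8): the original plus three rotations, each in colour and in black and white.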

In [None]:
import os
from PIL import Image

def process_images(image_dir):
    """
    Augment every image file in a directory with rotated and black/white copies.

    For each regular file ``name.ext`` the following files are written into
    the same directory:
    - ``name_bw.ext``: black and white copy of the source image.
    - ``name_rot90.ext``, ``name_rot180.ext``, ``name_rot270.ext``: rotated copies.
    - ``name_rot90_bw.ext`` etc.: black and white copies of the rotated images.

    The source file itself is left untouched. Files that cannot be processed
    are reported with a message and skipped.

    Parameters:
    - image_dir: Directory containing the images to process.

    Raises:
    - ValueError: If image_dir is not an existing directory.
    """
    if not os.path.isdir(image_dir):
        raise ValueError(f"{image_dir} is not a valid directory")

    # os.listdir returns a snapshot, so the files written below are not
    # picked up again during this run.
    for file in os.listdir(image_dir):
        file_path = os.path.join(image_dir, file)
        if not os.path.isfile(file_path):
            continue

        stem, extension = os.path.splitext(file)
        try:
            with Image.open(file_path) as img:
                # The source is already on disk: writing it again would
                # re-encode it over itself, so only add its black/white copy.
                process_image(img, file, image_dir, save_original=False)

                # NOTE: rotate() is called without expand=True; per the Pillow
                # docs the output then keeps the input size, so non-square
                # images are cropped at 90 and 270 degrees.
                for angle in [90, 180, 270]:
                    rotated_img = img.rotate(angle)
                    process_image(rotated_img, f"{stem}_rot{angle}{extension}", image_dir)

        except Exception as e:
            # Best effort: one unreadable file must not stop the whole batch.
            print(f"Failed to process {file}: {e}")


def process_image(img, file_name, image_dir, save_original=True):
    """
    Save the given image and its black and white version in the specified directory.

    Parameters:
    - img: The image to process (anything providing ``save(path)`` and ``convert(mode)``).
    - file_name: The base file name for the image.
    - image_dir: The directory to save the images.
    - save_original: When True (default) the image itself is written as
      ``file_name``; pass False when that file already exists on disk and only
      the black and white copy is wanted.
    """
    if save_original:
        img.save(os.path.join(image_dir, file_name))

    # "L" is Pillow's 8-bit greyscale mode.
    bw_img = img.convert("L")
    stem, extension = os.path.splitext(file_name)
    bw_img.save(os.path.join(image_dir, f"{stem}_bw{extension}"))

# Example usage on a single folder:
#     process_images('./images/')
# Augment every class folder found earlier in `subdirectories`; each source
# image gains 7 new files (3 rotations plus 4 black and white copies).
for i in subdirectories:
    print("Creating 7X more images in:"+i)
    process_images(i)


## Rename all images in a directory

In [None]:
def suffle_image_names(dir_name):
    """
    Give every JPG file in a directory a new random, numeric file name.

    Each file whose name ends in ``.jpg`` (case-insensitive) is renamed to
    ``00`` followed by a random integer between 100000 and 200000 inclusive,
    keeping the file's original extension. A candidate name that already
    exists in the directory is discarded and redrawn, so no file is
    overwritten. Other files are left alone.

    Parameters:
        dir_name (str): The path to the directory containing image files.

    Returns:
        None

    Raises:
        ValueError: If the directory holds at least as many JPG files as there
            are candidate names, because the redraw loop could then never
            find a free name.
    """
    import os
    import random

    lowest, highest = 100000, 200000
    available_names = highest - lowest + 1

    # Snapshot the files first so that freshly renamed files are not revisited.
    jpg_files = [name for name in os.listdir(dir_name) if name.lower().endswith('.jpg')]
    if len(jpg_files) >= available_names:
        raise ValueError(
            f"{dir_name} contains {len(jpg_files)} .jpg files but only "
            f"{available_names} random names are available"
        )

    for filename in jpg_files:
        file_extension = os.path.splitext(filename)[1]

        # Redraw until the candidate does not collide with an existing file.
        while True:
            new_filename = f"00{random.randint(lowest, highest)}{file_extension}"
            new_file_path = os.path.join(dir_name, new_filename)
            if not os.path.exists(new_file_path):
                break

        os.rename(os.path.join(dir_name, filename), new_file_path)
            #print(f'Renamed: {filename} -> {new_filename}')

# Example usage on a single folder:
#     suffle_image_names('./images/test')
# Give the .jpg files of every class folder in `subdirectories` random numeric names.
for i in subdirectories:
    print("Renaming images in:"+i)
    suffle_image_names(i)


## Verify .jpg images in a dir

In [None]:
def verify_jpg_images(folder_path):
    """
    Verify the integrity of JPG images in the specified folder.

    Every file whose name ends in ``.jpg`` (case-insensitive) is opened with
    Pillow and checked with ``Image.verify()``. Files that fail to open or
    verify are reported with a printed message; nothing is modified or
    deleted. Files with other extensions are ignored.

    Parameters:
        folder_path (str): The path to the folder containing JPG images.

    Returns:
        None
    """
    import os
    from PIL import Image

    for file_name in os.listdir(folder_path):
        if not file_name.lower().endswith(".jpg"):
            continue

        file_path = os.path.join(folder_path, file_name)
        try:
            # The context manager releases the file handle even when
            # verification fails.
            with Image.open(file_path) as image:
                image.verify()
        except (IOError, SyntaxError):
            # Report only; removing the file is left to the caller.
            print(f"Broken or corrupt image: {file_path}")


# Example usage on a single folder:
#     verify_jpg_images('./images/test')
# Check the .jpg files of every class folder found earlier in `subdirectories`.
for i in subdirectories:
    print("Verifying images in:"+i)
    verify_jpg_images(i) 


## Loop through a dir for truncated images

In [None]:
def find_truncated_images(directory):
    """
    Find files in a directory that Pillow cannot fully load.

    Every regular file in ``directory`` is opened with ``Image.open()`` and
    its pixel data is loaded. Any exception raised while opening or loading
    marks the file as truncated or corrupt, so files that are not images at
    all are reported as well. Nothing is deleted. Progress is shown with a
    tqdm bar.

    Parameters:
        directory (str): The path to the directory containing images.

    Returns:
        list: The file names (not full paths) of the files that failed to load.
    """
    import os
    from tqdm import tqdm
    from PIL import Image

    truncated_images = []
    for file in tqdm(os.listdir(directory)):
        file_path = os.path.join(directory, file)
        if not os.path.isfile(file_path):
            continue
        try:
            # The context manager closes the handle even when load() fails.
            with Image.open(file_path) as img:
                img.load()
        except Exception:
            # Deliberately broad: any failure counts as an unusable file.
            truncated_images.append(file)
    return truncated_images


def get_subdirs(dir_path):
    """
    Report the truncated images found in each subfolder of a directory.

    The immediate subdirectories of ``dir_path`` are visited in sorted order.
    For each one the function prints its name, the number of files flagged by
    ``find_truncated_images`` and the path of each flagged file. Paths are
    built with ``os.path.join``, so ``dir_path`` works with or without a
    trailing separator.

    Parameters:
        dir_path (str): The path to the directory containing subfolders.

    Returns:
        None
    """
    import os

    # Close the scandir iterator promptly instead of waiting for garbage collection.
    with os.scandir(dir_path) as entries:
        classes = sorted(entry.name for entry in entries if entry.is_dir())

    for name in classes:
        print(name)
        class_dir = os.path.join(dir_path, name)
        truncated_images = find_truncated_images(class_dir)
        print("Truncated Images:")
        print(len(truncated_images))
        for image in truncated_images:
            print(os.path.join(class_dir, image))

# Example usage: scan every class folder under the data directory.
# Note: this rebinds `main_directory` to the same folder written with a
# trailing slash.
main_directory = './images/'
get_subdirs(main_directory) 


## Split your data into train and validation directories (80% / 20%)

In [None]:
import os
import shutil
import random

def split_dataset(base_dir, train_dir, val_dir, val_ratio=0.2):
    """
        Directory structure prior to split
    |------->  Data Dir
                  |
       class1   class2   class3   etc

    Splits the dataset into training and validation sets and moves original data.

    For every class subdirectory of ``base_dir`` the files are shuffled, the
    first ``int(n * val_ratio)`` are copied to ``val_dir/<class>`` and the
    rest to ``train_dir/<class>``. Afterwards the original class directories
    are moved into ``base_dir/original_data``.

    The output directories are never treated as classes: ``train_dir``,
    ``val_dir`` and ``original_data`` are excluded by path, and directories
    named 'train', 'validation' or 'original_data' are excluded by name.

    Parameters:
    - base_dir: Directory containing the class subdirectories.
    - train_dir: Directory where the training set will be created.
    - val_dir: Directory where the validation set will be created.
    - val_ratio: Ratio of files to be used for validation (between 0 and 1).

    Raises:
    - ValueError: If val_ratio is outside the range 0 to 1.
    """
    # Reject a bad ratio before anything is created on disk.
    if not 0 <= val_ratio <= 1:
        raise ValueError(f"val_ratio must be between 0 and 1, got {val_ratio}")

    # Ensure the output directories exist
    os.makedirs(train_dir, exist_ok=True)
    os.makedirs(val_dir, exist_ok=True)

    # Create the original_data directory
    original_data_dir = os.path.join(base_dir, 'original_data')
    os.makedirs(original_data_dir, exist_ok=True)

    # Output folders may live inside base_dir under any name, so exclude them
    # by path as well as by the conventional names.
    reserved_names = {'train', 'validation', 'original_data'}
    reserved_paths = {
        os.path.abspath(path) for path in (train_dir, val_dir, original_data_dir)
    }
    classes = []
    for entry in os.listdir(base_dir):
        entry_path = os.path.join(base_dir, entry)
        if entry in reserved_names or not os.path.isdir(entry_path):
            continue
        if os.path.abspath(entry_path) in reserved_paths:
            continue
        classes.append(entry)

    for cls in classes:
        class_dir = os.path.join(base_dir, cls)
        train_class_dir = os.path.join(train_dir, cls)
        val_class_dir = os.path.join(val_dir, cls)

        # Ensure the class subdirectories exist in train and val directories
        os.makedirs(train_class_dir, exist_ok=True)
        os.makedirs(val_class_dir, exist_ok=True)

        # Only regular files are split; nested folders are ignored.
        files = [f for f in os.listdir(class_dir) if os.path.isfile(os.path.join(class_dir, f))]

        # Shuffle so the validation subset is a random sample.
        random.shuffle(files)

        val_count = int(len(files) * val_ratio)
        val_files = files[:val_count]
        train_files = files[val_count:]

        for f in train_files:
            shutil.copy(os.path.join(class_dir, f), os.path.join(train_class_dir, f))

        for f in val_files:
            shutil.copy(os.path.join(class_dir, f), os.path.join(val_class_dir, f))

        print(f'Class {cls}: {len(train_files)} training files, {len(val_files)} validation files')

    # Move the original class directories to the original_data directory
    for cls in classes:
        original_class_dir = os.path.join(base_dir, cls)
        new_location = os.path.join(original_data_dir, cls)
        shutil.move(original_class_dir, new_location)

    print(f'Moved original class directories to {original_data_dir}')

# Example usage: split ./images into images/train and images/validation using
# the default 20% validation share. The original class folders end up in
# images/original_data.
data_dir = "images"
train_dir = os.path.join(data_dir, "train")
val_dir = os.path.join(data_dir, "validation")
split_dataset(data_dir, train_dir, val_dir)


# Congratulations, now you are ready to train your model