# IMPORT LIBRARIES

In [None]:
import sys
import random
import re
import numpy as np
import torch

from tqdm import tqdm
from pathlib import Path
from SIBI_classifier.logger.logging import log_manager
from SIBI_classifier.utils.main_utils import custom_title_print
from concurrent.futures import ThreadPoolExecutor
from SIBI_classifier.configuration.configuration import ConfigurationManager
from SIBI_classifier.exception import SIBIClassificationException

collect_and_combine_images_logger = log_manager.setup_logger("CollectAndCombineImagesLogger", console_output=False)

# SETUP CONSTANTS
In this section, we will define some constants that are important for data processing and model training.
These constants will help us set the dataset folder path, the extension pattern of the image files to be collected, and which folders to access.
Later, these constants will be used in various parts of the code.

## function

In [3]:
def collect_images_with_regex_and_count(
        path, 
        folder_classes, 
        extensions_pattern
    ):
    """
    Collects images from specified directories that match a given file extension pattern.

    Args:
        path (str): The root directory path containing the folder classes.
        folder_classes (list): List of folder names representing different classes.
        extensions_pattern (str): Regex pattern to match file extensions.

    Returns:
        dict: A dictionary where keys are folder classes and values are lists of image paths.
    """
    try:
        # Initialize a dictionary to hold image paths for each class
        image_paths = {folder_class: [] for folder_class in folder_classes}
        
        # Compile the regex pattern for matching file extensions
        pattern = re.compile(str(extensions_pattern), re.IGNORECASE)

        # Iterate over each class folder
        for folder_class in folder_classes:
            folder_path = Path(path) / folder_class

            # Recursively search for files matching the pattern in each class folder
            for file_path in tqdm(folder_path.rglob("*"), desc=f"Collecting from {folder_class}", unit=" paths"):
                if pattern.search(file_path.suffix):
                    image_paths[folder_class].append(file_path)

        return image_paths

    except Exception as e:
        # Handle exceptions and return an empty dictionary if no classes are retrieved
        print(f"No classes are retrieved from directory due to an error: {e}")
        return {}

In [4]:
def get_random_images(
        image_paths, 
        num_samples, 
        seed=42
    ):
    """
    Retrieves a random number of images from the image path list.
    Args:
        image_paths (list): A list of image paths.
        num_samples (int): The number of images to retrieve. If None, all images will be selected.
        seed (int): Seed to control the random retrieval results so that the results can be reproduced. Default is 42.
    Returns:
        list: A list of randomly selected image paths.
    """

    try:
        random.seed(seed)
        return random.sample(image_paths, min(len(image_paths) if num_samples is None else num_samples, len(image_paths)))

    except Exception as e:
        raise SIBIClassificationException(e, sys)

In [5]:
def collect_and_combine_images(
        classes, 
        train_path=None, 
        valid_path=None,
        test_path=None, 
        pattern_regex=r"\.(jpe?g)$", 
        num_images_per_class=None, 
        seed=42
    ):
    """
    Collects and merges images from the training and validation folders, and retrieves a random number of images from each class.
    Args:
        classes (list): List of classes (folder names) to process.
        train_path (str): The main path of the training folder that contains image data sub-folders.
        valid_path (str): The main path of the validation folder that contains the image data sub-folders.
        pattern_regex (str): The regex pattern for matching image file extensions (e.g. r'\.(jpg|png|jpeg)$').
        num_images_per_class (dict): Dictionary containing the number of images to fetch for each class. If None, all images will be retrieved.
        seed (int): Seed for random image retrieval. Default is 42.
    Returns:
        list: A combined list of image paths from the training and validation folders that were randomly picked.
    """

    try:
        def process_class(cls):
            # Combine images from training and validation for each class
            all_train_images = train_images_paths.get(cls, [])
            all_valid_images = valid_images_paths.get(cls, [])
            all_test_images = test_images_paths.get(cls, [])
            all_combined_images = all_train_images + all_valid_images + all_test_images

            # Retrieve a random number of images from the combined image
            return get_random_images(
                image_paths=all_combined_images,
                num_samples=None if num_images_per_class is None else num_images_per_class.get(cls, len(all_combined_images)),
                seed=seed
            )

        custom_title_print(f"COLLECT {classes} FROM TRAINING DATA")
        train_images_paths = collect_images_with_regex_and_count(train_path, classes, pattern_regex)
        custom_title_print(f"=")
        print()

        # Print the title for the image collection process of the validation data
        custom_title_print(f"COLLECT {classes} FROM VALIDATION DATA")
        valid_images_paths = collect_images_with_regex_and_count(valid_path, classes, pattern_regex)
        custom_title_print(f"=")
        print()

        # Print the title for the image collection process of the test data
        custom_title_print(f"COLLECT {classes} FROM TEST DATA")
        test_images_paths = collect_images_with_regex_and_count(test_path, classes, pattern_regex)
        custom_title_print(f"=")
        print()

        # Print titles for the process of merging images from training and validation
        custom_title_print(f"COMBINING {classes} FROM TRAINING AND VALIDATION DATA")

        random_images = {}

        # Using ThreadPoolExecutor to speed up the process of fetching images from each class in parallel
        with ThreadPoolExecutor() as executor:
            results = executor.map(process_class, classes)

        # Store the random image results for each class into the dictionary
        for cls, images in zip(classes, results):
            random_images[cls] = images
            collect_and_combine_images_logger.info(f"Total {cls} taken: {log_manager.color_text(len(random_images[cls]), 'yellow')}")

        # Merge all image paths from all classes
        all_images_paths = sum(random_images.values(), [])
        all_images_paths = [str(path) for path in all_images_paths]
        custom_title_print(f"Total images taken: {len(all_images_paths)}")

        return all_images_paths

    except Exception as e:
        raise SIBIClassificationException(e, sys)

## main program

In [None]:
CONFIG = ConfigurationManager()
data_preprocessing_config = CONFIG.get_data_preprocessing_config()
data_ingestion_config = CONFIG.get_data_ingestion_config()
logger = log_manager.setup_logger("DataPreprocessingLogger")

np.random.seed(data_preprocessing_config.seed)
random.seed(data_preprocessing_config.seed)
torch.manual_seed(data_preprocessing_config.seed)

[32m[ 2024-11-28 13:00:18 ] CreateDirectoriesLogger[0m - [32mINFO[0m - created directory at: [96martifacts[0m
[32m[ 2024-11-28 13:00:18 ] CreateDirectoriesLogger[0m - [32mINFO[0m - created directory at: [96martifacts/<model_name>/data_preprocessing/objects[0m
[32m[ 2024-11-28 13:00:18 ] CreateDirectoriesLogger[0m - [32mINFO[0m - created directory at: [96martifacts/<model_name>/data_ingestion/SIBI_dataset[0m


In [None]:
logger.info("Collecting and combining images from training and validation folders...")

all_images_paths = collect_and_combine_images(
    classes = data_preprocessing_config.label_list,
    train_path  = data_ingestion_config.data_download_store_train_dir_path,
    pattern_regex = data_preprocessing_config.image_extension_regex,
    seed= data_preprocessing_config.seed
)



Collecting from A: 200 paths [00:00, 57420.82 paths/s]
Collecting from B: 200 paths [00:00, 54602.67 paths/s]
Collecting from C: 200 paths [00:00, 44348.97 paths/s]
Collecting from D: 200 paths [00:00, 44334.91 paths/s]
Collecting from E: 200 paths [00:00, 73908.44 paths/s]
Collecting from F: 200 paths [00:00, 45779.35 paths/s]
Collecting from G: 200 paths [00:00, 53227.21 paths/s]
Collecting from H: 200 paths [00:00, 37452.49 paths/s]
Collecting from I: 200 paths [00:00, 32819.28 paths/s]
Collecting from J: 200 paths [00:00, 49929.22 paths/s]
Collecting from K: 200 paths [00:00, 42607.72 paths/s]
Collecting from L: 200 paths [00:00, 27420.02 paths/s]
Collecting from M: 200 paths [00:00, 52298.05 paths/s]
Collecting from N: 200 paths [00:00, 31769.01 paths/s]
Collecting from O: 200 paths [00:00, 47532.91 paths/s]
Collecting from P: 200 paths [00:00, 52576.67 paths/s]
Collecting from Q: 200 paths [00:00, 46386.91 paths/s]
Collecting from R: 200 paths [00:00, 54888.49 paths/s]
Collecting


No classes are retrieved from directory due to an error: expected str, bytes or os.PathLike object, not NoneType

No classes are retrieved from directory due to an error: expected str, bytes or os.PathLike object, not NoneType



In [9]:
log_manager.clean_log_file()