In [15]:
from PIL import Image
import numpy as np
import os

def load_data(*path: str, labels: list, skip: bool = True) -> np.ndarray:
    """
    Loads data from a given path and returns a numpy array with the image and label of each file.

    Parameters:
    -----------
    path: str -> Path to the data folder. (can be multiple paths)
    labels: list -> List of labels for each path.
    skip: bool -> Skip files that are not images or cannot be read. (default: True)

    Returns:
    --------
    data_array: np.ndarray -> Numpy array with the image and label of each file.
    """

    # list to store the data. 
    data_array = []
    skipped_files = 0 # Number of skipped files.

    # ---------------------------------------------------
    # Check if the number of paths and labels are equal.
    # If not equal raise an error.
    # ---------------------------------------------------
    if len(path) == len(labels):

        # Loop through the paths and labels.
        for p, l in zip(path, labels):

            # --------------------------
            # Check if the path exists.
            # If not raise an error.
            # --------------------------    
            if os.path.exists(p):
                
                # ----------------------------------------------
                # Loop through the files in the path.
                # Append the image and label to the list.
                # ----------------------------------------------
                for file in os.listdir(p):
                    # Try to catch errors when reading the image.
                    try:
                        # Append the image and label to the list.
                        data_array.append([Image.open(os.path.join(p, file)), l])

                    except Image.UnidentifiedImageError as e:
                        # If skip is True, skip the file.
                        if skip:
                            skipped_files += 1 
                        else:
                            raise e
            else:
                raise FileNotFoundError("Path does not exist")
    else:
        raise ValueError("Number of paths and labels must be equal")
    
    if skipped_files > 0:
        print(f"Skipped {skipped_files} files")

    # Return the numpy array.
    return np.array(data_array, dtype=object)

In [16]:
path_fire = './Data/fire/fire/fire-images/'
path_forest = './Data/fire/fire/forest-images/'

dataset = load_data(path_fire, path_forest, labels=['fire', 'forest'])

print(f"dataset len: {len(dataset)}")
print(f"dataset shape: {dataset.shape}")
print(f"dataset fire images: {len([index for index in dataset if index[1] == 'fire'])}")
print(f"dataset forest images: {len([index for index in dataset if index[1] == 'forest'])}")

ValueError: setting an array element with a sequence. The requested array has an inhomogeneous shape after 1 dimensions. The detected shape was (2,) + inhomogeneous part.

In [5]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(dataset[:, 0], dataset[:, 1], test_size=0.2, random_state=42)



In [23]:
l = [[1, 2, 3], [4, 5, 6], [7, 8, 9]]
l = np.array(l)

In [26]:
print(l)

[[1 2 3]
 [4 5 6]
 [7 8 9]]
