## Import Libraries

In [7]:
import os                               # For interacting with the operating system
from PIL import Image, UnidentifiedImageError  # To work with images and handle image-related errors
import numpy as np                      # For numerical computations and array manipulation
from sklearn.model_selection import cross_val_predict  # To perform cross-validation and generate predictions
from sklearn.preprocessing import LabelEncoder         # To encode labels into integers
from xgboost import XGBClassifier       # Import the XGBoost classifier for machine learning tasks
import cleanlab                         # For handling noisy labels in datasets
import random                           # To generate random numbers or shuffle data


## Image filtering

In [33]:
class InvalidImageSizeError(Exception):
    """Raised when an image's dimensions differ from the size the pipeline requires."""

# Root directory that holds one sub-folder of images per stage.
# NOTE(review): this is an absolute, machine-specific path. Because the literal is a
# raw string, every `\\` pair stays as two backslashes in the resulting path.
base_path = r'C:\\Users\\cbado\\OneDrive - University of Huddersfield\\Year 3\\Data Driven AI\\Assignment 2\\hand images'

# Per-stage error tallies, keyed "Stage1" .. "Stage8" and all starting at zero.
# Each tally is its own dict so the three counts are tracked independently.
unable_to_open_count = dict.fromkeys((f"Stage{i}" for i in range(1, 9)), 0)      # files PIL could not identify
invalid_size_count = dict.fromkeys((f"Stage{i}" for i in range(1, 9)), 0)        # images that are not the expected size
invalid_extension_count = dict.fromkeys((f"Stage{i}" for i in range(1, 9)), 0)   # files with an unsupported extension

# Parallel lists filled by the loading loop: entry i of each describes the same image.
train_data = []    # flattened pixel vectors (model input)
image_paths = []   # full path of each accepted image
labels = []        # stage folder name of each accepted image


# Traverse each stage folder and load every valid 150x150 image as a flattened
# grayscale vector. Rejected files are tallied per stage in the three *_count dicts.
# Entries are sorted so the sample order (and therefore the later cross-validation
# folds) does not depend on the order in which the filesystem lists them.
for folder in sorted(os.listdir(base_path)):
    folder_path = os.path.join(base_path, folder)  # Full path to the current stage folder
    if not os.path.isdir(folder_path):
        # A stray file next to the stage folders would make os.listdir() below fail
        continue
    stage_key = folder  # The folder name identifies the stage in the error tallies

    for file in sorted(os.listdir(folder_path)):
        # Case-insensitive extension check, so mixed-case names such as "x.Jpg" are accepted too
        if not file.lower().endswith((".png", ".jpg", ".jpeg")):
            # .get() keeps the tally working for folders not named Stage1..Stage8
            invalid_extension_count[stage_key] = invalid_extension_count.get(stage_key, 0) + 1
            print(f"{file} has invalid extension\n")
            continue

        img_path = os.path.join(folder_path, file)  # Full path of the image file

        try:
            # The context manager closes the underlying file handle once the
            # grayscale copy has been produced
            with Image.open(img_path) as raw_img:
                img = raw_img.convert("L")
            # Only images of exactly 150x150 pixels are accepted
            if img.size != (150, 150):
                raise InvalidImageSizeError()
            pixels = np.array(img).flatten()
        except InvalidImageSizeError:
            print(f"Image size of {file} is {img.size}, but it must be 150x150.\n")
            invalid_size_count[stage_key] = invalid_size_count.get(stage_key, 0) + 1
        except UnidentifiedImageError:
            # Must be handled before OSError, which it derives from
            unable_to_open_count[stage_key] = unable_to_open_count.get(stage_key, 0) + 1
            print(f"Unable to open Image: {file}\n")
        except OSError as e:
            # Any other I/O problem is reported but deliberately not counted
            print(f"Error processing image {file}: {e}\n")
        else:
            # Keep the three lists aligned: one entry per accepted image
            train_data.append(pixels)
            image_paths.append(img_path)
            labels.append(folder)


Image size of 10_Stage_1_U2254663.jpg is (290, 236), but it must be 150x150.

1_10_2162526.HEIC has invalid extension

Image size of 1_10_2254417.jpg is (177, 177), but it must be 150x150.

1_1_2162526.HEIC has invalid extension

Image size of 1_1_2254417.jpg is (177, 177), but it must be 150x150.

1_2_2162526.HEIC has invalid extension

Image size of 1_2_2254417.jpg is (177, 177), but it must be 150x150.

1_3_2162526.HEIC has invalid extension

Image size of 1_3_2254417.jpg is (177, 177), but it must be 150x150.

1_4_2162526.HEIC has invalid extension

Image size of 1_4_2254417.jpg is (177, 177), but it must be 150x150.

1_5_2162526.HEIC has invalid extension

Image size of 1_5_2254417.jpg is (177, 177), but it must be 150x150.

1_6_2162526.HEIC has invalid extension

Image size of 1_6_2254417.jpg is (177, 177), but it must be 150x150.

1_7_2162526.HEIC has invalid extension

Image size of 1_7_2254417.jpg is (177, 177), but it must be 150x150.

1_8_2162526.HEIC has invalid extension



The reasons for the chosen error-handling approach are explained in Section 3 of the CHS2406 Assignment 2 notebook.

The images are converted to grayscale because loading them as RGB caused low-memory errors.

## Readily Available Model Training

In [8]:
# Stack the per-image pixel vectors into a single feature matrix
train_data = np.asarray(train_data)

# Map the folder-name labels onto integer class ids
label_encoder = LabelEncoder()
train_labels = label_encoder.fit_transform(labels)

# Histogram-based XGBoost classifier.
# NOTE(review): enable_categorical presumably has no effect on a plain numeric
# pixel matrix - confirm against the XGBoost documentation.
model = XGBClassifier(
    tree_method="hist",
    enable_categorical=True,
)

# 10-fold cross-validation: each sample's class probabilities come from a model
# fitted on the other folds
pred_probs = cross_val_predict(
    model,
    train_data,
    train_labels,
    cv=10,
    method='predict_proba',
)


1. **Data Conversion:** Converts training data to a NumPy array for compatibility with machine learning models.

2. **Label Encoding:** Converts categorical labels to numeric values, as required by most machine learning models.

3. **XGBoost Configuration:** 
   - Uses a histogram-based method (`tree_method="hist"`) for faster tree-building.
   - Enables direct handling of categorical features with `enable_categorical=True`.

4. **Cross-Validation and Probability Prediction:**
   - Cross-validation (`cv=10`) improves model performance estimation.
   - `predict_proba` provides class probabilities rather than hard class predictions, giving more information for model evaluation.


## Image Masking and File Writing

In [12]:
####################################################################################################
#Filtering Images
####################################################################################################


# This cell rebinds train_data, train_labels and image_paths to their filtered
# versions, so running it a second time would pair the shortened arrays with the
# full-length pred_probs. Fail early with a clear message instead.
if not (len(train_data) == len(train_labels) == len(image_paths) == len(labels) == len(pred_probs)):
    raise ValueError(
        "train_data, train_labels, image_paths, labels and pred_probs have different lengths; "
        "this cell has probably already been run. Re-run the image loading and model training cells first."
    )

# Use Cleanlab to find samples whose given label disagrees with the predicted
# probabilities; the returned indices are ranked by self-confidence
cl_issue_idx = cleanlab.filter.find_label_issues(train_labels, pred_probs, return_indices_ranked_by='self_confidence')

# Count the flagged images per stage (Stages 1-8 start at zero)
mislabelled_count = {f"Stage{i}": 0 for i in range(1, 9)}
for idx in cl_issue_idx:
    stage_label = labels[idx]  # Folder name of the flagged image
    # .get() keeps the tally working for folders not named Stage1..Stage8
    mislabelled_count[stage_label] = mislabelled_count.get(stage_label, 0) + 1

# Print the results: number of label issues identified and the indices of the issues
print(f"Number of potential label issues identified: {len(cl_issue_idx)}")
print(f"Indices of label issues: {cl_issue_idx}")

# Boolean mask that is True for every sample to keep and False for flagged ones
mask = np.ones(len(train_labels), dtype=bool)
mask[cl_issue_idx] = False

# Apply the mask to the data, labels and paths so they stay aligned.
# Note that the `labels` list itself is left unfiltered.
train_data = train_data[mask]
train_labels = np.array(train_labels)[mask]
image_paths = np.array(image_paths)[mask]


print(f"Filtered train data shape: {train_data.shape}")
print(f"Filtered labels shape: {train_labels.shape}")



####################################################################################################
####################################################################################################
#Error Logging
####################################################################################################
####################################################################################################


# Function to write the counts of images with various issues to a file
def write_stage_counts_to_file(title, counts, filename):
    """Write a per-stage count report to a text file, overwriting any existing file.

    Args:
        title: Heading written as the first line of the report.
        counts: Mapping of stage name to count; one line is written per entry,
            in the mapping's iteration order, formatted as ``<stage>: <<count>>``.
        filename: Path of the file to create or overwrite.
    """
    with open(filename, 'w') as report:
        report.write(title + "\n")
        report.writelines(f"{stage}: <{count}>\n" for stage, count in counts.items())

# One report file per issue type: (report title, per-stage counts, output file)
for report_title, stage_counts, report_file in (
    ("Images that couldn't be opened:", unable_to_open_count, "unable_to_open.txt"),
    ("Images that were not 150x150:", invalid_size_count, "invalid_size.txt"),
    ("Images that were mislabelled:", mislabelled_count, "mislabelled.txt"),
    ("Images that had invalid extension:", invalid_extension_count, "invalid_extension.txt"),
):
    write_stage_counts_to_file(report_title, stage_counts, report_file)


####################################################################################################
####################################################################################################
####################################################################################################
####################################################################################################






####################################################################################################
#Valid Data Storage
####################################################################################################


# (path, label) pairs of the images that survived filtering, grouped by stage
selected_image_paths_and_labels = []

# Unique stages from the filtered labels
unique_stages = set(train_labels)

# Iterate the stages in ascending order: a set has no guaranteed iteration order,
# so sorting makes the layout of the output file deterministic
for stage in sorted(unique_stages):
    # Positions of the images belonging to this stage, in their original order
    stage_indices = np.flatnonzero(np.asarray(train_labels) == stage)

    # Append the image paths and labels of the current stage to the list
    for idx in stage_indices:
        selected_image_paths_and_labels.append((image_paths[idx], train_labels[idx]))

# Write one "path,label" line per image.
# NOTE(review): a path that itself contains a comma would be ambiguous in this
# format - confirm how the consuming notebook parses the lines.
with open("image_paths_and_labels.txt", "w") as out_file:
    out_file.writelines(f"{img_path},{label}\n" for img_path, label in selected_image_paths_and_labels)

print("Selected image paths and labels saved to image_paths_and_labels.txt.")

Number of potential label issues identified: 6654
Indices of label issues: [1405 4494 1409 ... 9655 2193 2733]
Filtered train data shape: (4496, 22500)
Filtered labels shape: (4496,)
Selected image paths and labels saved to image_paths_and_labels.txt.


Above, all images that cleanlab flags as potentially "mislabelled" are filtered out, and the paths and labels of the remaining images are stored in a text file to be used in the CHS2406 Assignment 2 notebook. The number of errors found in each stage is also recorded and stored in separate text files.