In [1]:
import os
import cv2
import pandas as pd
from pathlib import Path
import sys
import numpy as np

# Import sampling methods
from BHSIG260_Bengali_Sampling import sample_dataset_bhsig260_bengali
from BHSIG260_Hindi_Sampling import sample_dataset_bhsig260_hindi
from Cedar_sampling import sample_dataset_cedar
from Real_Fake_Data_Sampling import sample_dataset_real_fake
from Signature_Verification_sampling import sample_signature_verification_dataset
from Hansig_sampling import sample_dataset_hansig

# Set project root directory and add to Python path
project_root = Path.cwd()
sys.path.append(str(project_root))

# Import utility functions for image preprocessing
import img_preprocessing_util_functions as img_utils

def preprocess_image(image_path, steps, switches, hyperparams):
    """
    Load an image and apply the enabled preprocessing steps to it.

    Processing happens in three stages:
      1. Optional grayscale conversion, followed by optional binarization.
         Binarization is only attempted when grayscale conversion is enabled.
      2. Intensity steps ('clahe', 'gaussian_blur', 'adaptive_threshold'), in the
         order they appear in ``steps``.
      3. Binary-only steps ('skeletonize', 'augment'), in the order they appear
         in ``steps``.

    Args:
        image_path: Path of the image file to read (str or path-like).
        steps: Iterable of step names to consider. Unknown names are ignored.
        switches: Dict mapping a step name (plus 'grayscale' and 'grey_to_binary')
            to a boolean. A missing key counts as disabled.
        hyperparams: Dict of per-step settings, read only for enabled steps:
            'clahe' -> {'clipLimit', 'tileGridSize'},
            'gaussian_blur' -> {'sigma', optional 'ksize' (default 5)},
            'adaptive_threshold' -> {'blockSize', 'C'}.

    Returns:
        The processed image. A boolean array is scaled to uint8 (0 or 255);
        any other dtype is returned unchanged.

    Raises:
        ValueError: If the image cannot be loaded.
        KeyError: If an enabled step has no entry (or a missing setting) in
            ``hyperparams``.
    """
    # str() keeps pathlib.Path inputs working regardless of the OpenCV build.
    image = cv2.imread(str(image_path))
    if image is None:
        raise ValueError(f"Failed to load image: {image_path}")

    # Grayscale comes first; without it the binary conversion is skipped.
    if switches.get('grayscale', False):
        image = img_utils.rgb_to_grey(image)
        if switches.get('grey_to_binary', False):
            image = img_utils.grey_to_binary(image)

    # Materialize once so `steps` may be any iterable, including a generator.
    active_steps = [step for step in steps if switches.get(step, False)]

    # Stage 2: intensity steps.
    # NOTE(review): the OpenCV calls below presumably need a single-channel
    # 8-bit image, i.e. grayscale enabled and no boolean mask yet - confirm.
    for step in active_steps:
        if step == 'clahe':
            clahe_cfg = hyperparams[step]
            tile = clahe_cfg['tileGridSize']
            clahe = cv2.createCLAHE(clipLimit=clahe_cfg['clipLimit'], tileGridSize=(tile, tile))
            image = clahe.apply(image)
        elif step == 'gaussian_blur':
            blur_cfg = hyperparams[step]
            # Kernel size used to be fixed at 5x5; 'ksize' is optional and defaults to that.
            ksize = blur_cfg.get('ksize', 5)
            image = cv2.GaussianBlur(image, (ksize, ksize), blur_cfg['sigma'])
        elif step == 'adaptive_threshold':
            threshold_cfg = hyperparams[step]
            image = cv2.adaptiveThreshold(image, 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C, cv2.THRESH_BINARY,
                                          blockSize=threshold_cfg['blockSize'], C=threshold_cfg['C'])

    # Stage 3: binary-only steps, always after every intensity step.
    for step in active_steps:
        if step == 'skeletonize':
            image = img_utils.skeletonize_image(image)
        elif step == 'augment':
            image = img_utils.augment_image(image)

    # Boolean masks are scaled to 0/255 so they can be written as ordinary images.
    if image.dtype == bool:
        image = (image * 255).astype(np.uint8)
    return image




_IMAGE_EXTENSIONS = (".jpeg", ".jpg", ".png")


def _preprocess_label_folder(label_dir, label_output_dir, seed_folder, person_folder, label,
                             steps, switches, hyperparams):
    """Preprocess every image in one label folder, write the results, and return their metadata records."""
    # Sorted so the numbered output names are reproducible; os.listdir order is arbitrary.
    # Extension check is case-insensitive so files such as "IMG.PNG" are not dropped.
    image_files = sorted(
        name for name in os.listdir(label_dir) if name.lower().endswith(_IMAGE_EXTENSIONS)
    )

    records = []
    for idx, img_file in enumerate(image_files, start=1):
        img_path = os.path.join(label_dir, img_file)
        processed_img = preprocess_image(img_path, steps, switches, hyperparams)
        output_path = os.path.join(label_output_dir, f"{person_folder}_{label}_{idx}.png")
        # imwrite reports failure through its return value; do not log files that were never written.
        if not cv2.imwrite(output_path, processed_img):
            print(f"Warning: Could not write {output_path}; skipping this image.")
            continue
        records.append({
            "Seed": seed_folder,
            "Person ID/Name": person_folder,
            "Label": label,
            "Image File": output_path
        })
    return records


def preprocess_data_from_sampling_folder(sampling_folder_path, output_folder_path, steps, switches, hyperparams):
    """
    Preprocess every sampled image under ``sampling_folder_path``.

    Expected input layout: ``<sampling_folder>/<seed>/<person>/{true,forged|forge}/<images>``.
    Results are written to ``<output_folder>/<seed>/<person>/{true,forged}/`` as
    ``<person>_<label>_<n>.png``, numbered from 1 in sorted filename order.

    Args:
        sampling_folder_path: Root folder containing one sub-folder per seed.
        output_folder_path: Root folder for the preprocessed images (created as needed).
        steps, switches, hyperparams: Passed through unchanged to ``preprocess_image``.

    Returns:
        A list of dicts with keys "Seed", "Person ID/Name", "Label" and
        "Image File" (the output path), one per image successfully written.

    Raises:
        ValueError: Propagated from ``preprocess_image`` when an image cannot be loaded.
    """
    preprocessed_data = []

    for seed_folder in sorted(os.listdir(sampling_folder_path)):
        seed_path = os.path.join(sampling_folder_path, seed_folder)
        if not os.path.isdir(seed_path):
            continue

        seed_output_dir = os.path.join(output_folder_path, seed_folder)
        os.makedirs(seed_output_dir, exist_ok=True)

        for person_folder in sorted(os.listdir(seed_path)):
            person_path = os.path.join(seed_path, person_folder)
            if not os.path.isdir(person_path):
                continue

            # Both label folders are always created, even if one has no input images.
            person_output_dir = os.path.join(seed_output_dir, person_folder)
            true_output_dir = os.path.join(person_output_dir, "true")
            forged_output_dir = os.path.join(person_output_dir, "forged")
            os.makedirs(true_output_dir, exist_ok=True)
            os.makedirs(forged_output_dir, exist_ok=True)

            true_label_path = os.path.join(person_path, "true")
            if os.path.isdir(true_label_path):
                preprocessed_data.extend(_preprocess_label_folder(
                    true_label_path, true_output_dir, seed_folder, person_folder, "true",
                    steps, switches, hyperparams))
            else:
                print(f"Warning: Missing 'true' folder for {person_folder} in seed {seed_folder}.")

            # The input folder may be named "forged" or "forge"; output always uses "forged".
            forged_label_path = os.path.join(person_path, "forged")
            if not os.path.isdir(forged_label_path):
                forged_label_path = os.path.join(person_path, "forge")
            if os.path.isdir(forged_label_path):
                preprocessed_data.extend(_preprocess_label_folder(
                    forged_label_path, forged_output_dir, seed_folder, person_folder, "forged",
                    steps, switches, hyperparams))
            else:
                print(f"Warning: Missing 'forged' or 'forge' folder for {person_folder} in seed {seed_folder}.")

    return preprocessed_data


def run_sampling_methods_with_preprocessing(methods, params, base_output_dir, steps, switches, hyperparams):
    """
    Run each sampling method, preprocess what it sampled, and save a combined CSV.

    For every ``(method, method_name)`` pair that has an entry in ``params``:
      1. the sampler is called with data_path, destination_path, num_individuals,
         its seed argument and number_of_signatures;
      2. every image under its ``destination_path`` is preprocessed into
         ``<base_output_dir>/<method_name>_Dataset``.

    Args:
        methods: Iterable of ``(callable, method_name)`` pairs.
        params: Dict mapping method_name to that sampler's parameter dict.
            It is not modified. Extra keys (e.g. 'language') are ignored.
        base_output_dir: Folder receiving the per-method output folders and
            ``preprocessed_signature_dataset.csv``; created if missing.
        steps, switches, hyperparams: Forwarded to the preprocessing functions.

    Returns:
        pandas.DataFrame with one row per preprocessed image.

    Raises:
        ValueError: If a method with parameters has a name this function does
            not know how to call.
        KeyError: If a required parameter is missing for a method.
    """
    # Keyword each sampler uses for its seed: CEDAR takes 'seeds', all others 'seed'.
    seed_key_by_method = {
        'CEDAR': 'seeds',
        'Signature_Verification': 'seed',
        'BHSig260_Bengali': 'seed',
        'BHSig260_Hindi': 'seed',
        'Hansig': 'seed',
        'Real_Fake_Data': 'seed',
    }
    all_preprocessed_data = []

    for method, method_name in methods:
        method_params = params.get(method_name)
        if not method_params:
            print(f"Skipping {method_name}: no parameters provided.")
            continue

        # Fail early with a clear message rather than hitting an unbound
        # `filtered_params` or reusing the previous method's parameters.
        if method_name not in seed_key_by_method:
            raise ValueError(
                f"Unknown sampling method '{method_name}'. "
                f"Expected one of: {sorted(seed_key_by_method)}"
            )

        print(f"Running {method_name} with parameters: {method_params}")
        method_output_dir = os.path.join(base_output_dir, f"{method_name}_Dataset")
        os.makedirs(method_output_dir, exist_ok=True)

        # Build the call arguments by reading only; the caller's dict stays intact.
        seed_key = seed_key_by_method[method_name]
        destination_path = method_params['destination_path']
        filtered_params = {
            'data_path': method_params['data_path'],
            'destination_path': destination_path,
            'num_individuals': method_params['num_individuals'],
            seed_key: method_params[seed_key],
            'number_of_signatures': method_params['number_of_signatures']
        }

        print(f"Calling {method_name} with filtered parameters: {filtered_params}")
        method(**filtered_params)
        print(f"{method_name} sampling completed. Proceeding with preprocessing...\n")

        # Preprocess everything found in this method's sampling folder.
        preprocessed_data = preprocess_data_from_sampling_folder(
            destination_path, method_output_dir, steps, switches, hyperparams
        )

        print(f"Entries from {method_name}: {len(preprocessed_data)}")
        all_preprocessed_data.extend(preprocessed_data)

        print(f"Total entries after {method_name}: {len(all_preprocessed_data)}\n")

    # Save all preprocessed data to a CSV; the folder may not exist if nothing ran.
    os.makedirs(base_output_dir, exist_ok=True)
    preprocessed_df = pd.DataFrame(all_preprocessed_data)
    preprocessed_csv_path = Path(base_output_dir) / 'preprocessed_signature_dataset.csv'
    preprocessed_df.to_csv(preprocessed_csv_path, index=False)
    print(f'Saved preprocessed data to {preprocessed_csv_path}')

    print("Data preview:")
    print(preprocessed_df.head())
    print(f"Final total number of entries: {len(preprocessed_df)}")

    return preprocessed_df




# Sampling functions to run, in order. Each entry pairs a sampler with the name
# used to look up its parameters in `params` and to name its output folder
# ("<name>_Dataset").
methods = [
    (sample_dataset_cedar, 'CEDAR'),
    (sample_signature_verification_dataset, 'Signature_Verification'),
    (sample_dataset_bhsig260_bengali, 'BHSig260_Bengali'), 
    (sample_dataset_bhsig260_hindi, 'BHSig260_Hindi'), 
    (sample_dataset_real_fake, 'Real_Fake_Data'),
    (sample_dataset_hansig, 'Hansig')
]



# Define preprocessing steps order, switches, and hyperparameters


# Run the method and get the preprocessed DataFrame
if __name__ == "__main__":

    # Optional steps to consider; a step only runs if it is also enabled in `switches`.
    # Names handled by preprocess_image: 'clahe', 'gaussian_blur', 'adaptive_threshold',
    # 'skeletonize', 'augment'.
    steps = ['skeletonize', 'augment']

    switches = {
        'grayscale': True,             # Enable grayscale conversion
        'clahe': False,
        'gaussian_blur': False,
        'adaptive_threshold': False,
        'grey_to_binary': True,        # Enable binary conversion
        'skeletonize': True,
        'augment': True
    }

    # Settings are only read for steps that are enabled above.
    hyperparams = {
        'clahe': {'clipLimit': 2.0, 'tileGridSize': 8},
        'adaptive_threshold': {'blockSize': 11, 'C': 2},
        'gaussian_blur': {'sigma': 1.5}
    }

    # Dataset locations. Set SIGNATURE_DATA_DIR / SIGNATURE_OUTPUT_DIR to run on
    # another machine; the defaults are the original author's local paths.
    base_data_path = os.environ.get(
        "SIGNATURE_DATA_DIR", "/Users/hongmingfu/Desktop/Brown University/DATA2050"
    )
    base_output_path = os.environ.get(
        "SIGNATURE_OUTPUT_DIR", f"{base_data_path}/preprocessed_dataset"
    )

    # Per-dataset sampler arguments, keyed by the names used in `methods`.
    # CEDAR takes a list under 'seeds'; every other sampler takes a single 'seed'.
    params = {
        'CEDAR': {
            'data_path': f"{base_data_path}/Cedar",
            'destination_path': f"{base_output_path}/Cedar_Sampled",
            'num_individuals': 5,
            'seeds': [123],
            'number_of_signatures': 5,
            'language': 'English'
        },
        'Signature_Verification': {
            'data_path': f"{base_data_path}/Signature_Verification_Dataset",
            'destination_path': f"{base_output_path}/Signature_Verification_Sampled",
            'num_individuals': 5,
            'seed': 123,
            'number_of_signatures': 5,
            'language': 'English'
        },
        'BHSig260_Bengali': {
            'data_path': f"{base_data_path}/BHSig260/Bengali",
            'destination_path': f"{base_output_path}/BHSig260_Bengali_Sampled",
            'num_individuals': 10,
            'seed': 101,
            'number_of_signatures': 5,
            'language': 'Bengali'
        },
        'BHSig260_Hindi': {
            'data_path': f"{base_data_path}/BHSig260/Hindi",
            'destination_path': f"{base_output_path}/BHSig260_Hindi_Sampled",
            'num_individuals': 8,
            'seed': 404,
            'number_of_signatures': 5,
            'language': 'Hindi'
        },
        'Real_Fake_Data': {
            'data_path': f"{base_data_path}/Real_Fake_Signature/Signature Images",
            'destination_path': f"{base_output_path}/Real_Fake_Signature_Sampled",
            'num_individuals': 3,
            'seed': 707,
            'number_of_signatures': 5,
            'language': 'Turkish'
        },
        'Hansig': {
            'data_path': f"{base_data_path}/Hansig",
            'destination_path': f"{base_output_path}/Hansig_Sampled",
            'num_individuals': 5,
            'seed': 555,
            'number_of_signatures': 5,
            'language': 'Chinese'
        }
    }

    # Sample every dataset, preprocess the samples, and keep the combined DataFrame.
    preprocessed_df = run_sampling_methods_with_preprocessing(methods, params, base_output_path, steps, switches, hyperparams)



Running CEDAR with parameters: {'data_path': '/Users/hongmingfu/Desktop/Brown University/DATA2050/Cedar', 'destination_path': '/Users/hongmingfu/Desktop/Brown University/DATA2050/preprocessed_dataset/Cedar_Sampled', 'num_individuals': 5, 'seeds': [123], 'number_of_signatures': 5, 'language': 'English'}
Calling CEDAR with filtered parameters: {'data_path': '/Users/hongmingfu/Desktop/Brown University/DATA2050/Cedar', 'destination_path': '/Users/hongmingfu/Desktop/Brown University/DATA2050/preprocessed_dataset/Cedar_Sampled', 'num_individuals': 5, 'seeds': [123], 'number_of_signatures': 5}
Dataset reorganized and exported for seed 123 to: /Users/hongmingfu/Desktop/Brown University/DATA2050/preprocessed_dataset/Cedar_Sampled/random_seeds_123
CEDAR sampling completed. Proceeding with preprocessing...

Entries from CEDAR: 50
Total entries after CEDAR: 50

Running Signature_Verification with parameters: {'data_path': '/Users/hongmingfu/Desktop/Brown University/DATA2050/Signature_Verification_



Entries from Signature_Verification: 50
Total entries after Signature_Verification: 100

Running BHSig260_Bengali with parameters: {'data_path': '/Users/hongmingfu/Desktop/Brown University/DATA2050/BHSig260/Bengali', 'destination_path': '/Users/hongmingfu/Desktop/Brown University/DATA2050/preprocessed_dataset/BHSig260_Bengali_Sampled', 'num_individuals': 10, 'seed': 101, 'number_of_signatures': 5, 'language': 'Bengali'}
Calling BHSig260_Bengali with filtered parameters: {'data_path': '/Users/hongmingfu/Desktop/Brown University/DATA2050/BHSig260/Bengali', 'destination_path': '/Users/hongmingfu/Desktop/Brown University/DATA2050/preprocessed_dataset/BHSig260_Bengali_Sampled', 'num_individuals': 10, 'seed': 101, 'number_of_signatures': 5}
Restructuring complete! Data for 10 individuals with 5 genuine and forged signatures each has been organized in '/Users/hongmingfu/Desktop/Brown University/DATA2050/preprocessed_dataset/BHSig260_Bengali_Sampled/random_seeds_101'.
BHSig260_Bengali sampling

In [3]:
preprocessed_df

Unnamed: 0,Seed,Person ID/Name,Label,Image File
0,random_seeds_123,person_002,true,/Users/hongmingfu/Desktop/Brown University/DAT...
1,random_seeds_123,person_002,true,/Users/hongmingfu/Desktop/Brown University/DAT...
2,random_seeds_123,person_002,true,/Users/hongmingfu/Desktop/Brown University/DAT...
3,random_seeds_123,person_002,true,/Users/hongmingfu/Desktop/Brown University/DAT...
4,random_seeds_123,person_002,true,/Users/hongmingfu/Desktop/Brown University/DAT...
...,...,...,...,...
355,random_seeds_555,person_352,forged,/Users/hongmingfu/Desktop/Brown University/DAT...
356,random_seeds_555,person_352,forged,/Users/hongmingfu/Desktop/Brown University/DAT...
357,random_seeds_555,person_352,forged,/Users/hongmingfu/Desktop/Brown University/DAT...
358,random_seeds_555,person_352,forged,/Users/hongmingfu/Desktop/Brown University/DAT...


In [1]:
# Run the script and capture outputs
%run preprocessing_for_model.py

# Assuming preprocessing_for_model.py saves sampled_df and preprocessed_df as variables
# Display the DataFrames if they are accessible in the namespace after execution
try:
    display(sampled_df)
    display(preprocessed_df)
except NameError:
    print("sampled_df and preprocessed_df are not accessible. Ensure they are declared as global in the script.")


Running CEDAR with parameters: {'data_path': '/Users/hongmingfu/Desktop/Brown University/DATA2050/Cedar', 'destination_path': '/Users/hongmingfu/Desktop/Brown University/DATA2050/preprocessed_dataset/sampled/Cedar_Sampled', 'num_individuals': 5, 'seeds': [123], 'number_of_signatures': 5, 'language': 'English'}
Dataset reorganized and exported for seed 123 to: /Users/hongmingfu/Desktop/Brown University/DATA2050/preprocessed_dataset/sampled/CEDAR_Sampled/random_seeds_123
CEDAR sampling completed.

Running Signature_Verification with parameters: {'data_path': '/Users/hongmingfu/Desktop/Brown University/DATA2050/Signature_Verification_Dataset', 'destination_path': '/Users/hongmingfu/Desktop/Brown University/DATA2050/preprocessed_dataset/sampled/Signature_Verification_Sampled', 'num_individuals': 5, 'seed': 123, 'number_of_signatures': 5, 'language': 'English'}
Dataset reorganized and exported for seed 123 to: /Users/hongmingfu/Desktop/Brown University/DATA2050/preprocessed_dataset/sampled/



Saved preprocessed data to /Users/hongmingfu/Desktop/Brown University/DATA2050/preprocessed_dataset/preprocessed/preprocessed_info.csv


libpng error: Invalid IHDR data
libpng error: Invalid IHDR data
libpng error: Invalid IHDR data
libpng error: Invalid IHDR data
libpng error: Invalid IHDR data
libpng error: Invalid IHDR data
libpng error: Invalid IHDR data
libpng error: Invalid IHDR data
libpng error: Invalid IHDR data
libpng error: Invalid IHDR data
libpng error: Invalid IHDR data
libpng error: Invalid IHDR data
libpng error: Invalid IHDR data
libpng error: Invalid IHDR data
libpng error: Invalid IHDR data
libpng error: Invalid IHDR data
libpng error: Invalid IHDR data
libpng error: Invalid IHDR data
libpng error: Invalid IHDR data
libpng error: Invalid IHDR data
libpng error: Invalid IHDR data
libpng error: Invalid IHDR data
libpng error: Invalid IHDR data
libpng error: Invalid IHDR data
libpng error: Invalid IHDR data
libpng error: Invalid IHDR data
libpng error: Invalid IHDR data
libpng error: Invalid IHDR data
libpng error: Invalid IHDR data
libpng error: Invalid IHDR data
libpng error: Invalid IHDR data
libpng e

Saved preprocessed DataFrame for EfficientNet to /Users/hongmingfu/Desktop/Brown University/DATA2050/preprocessed_dataset/preprocessed/EfficientNet/preprocessed_signature_df_EfficientNet.pkl
Saved triplets for EfficientNet to /Users/hongmingfu/Desktop/Brown University/DATA2050/preprocessed_dataset/preprocessed/EfficientNet/preprocessed_triplets.npy


libpng error: Invalid IHDR data
libpng error: Invalid IHDR data


Unnamed: 0,Data Source,Language,Seed,Person ID/Name,Class,Image ID,Image File
0,CEDAR,English,person_034,forge,FORGED,1,/Users/hongmingfu/Desktop/Brown University/DAT...
1,CEDAR,English,person_034,forge,FORGED,3,/Users/hongmingfu/Desktop/Brown University/DAT...
2,CEDAR,English,person_034,forge,FORGED,2,/Users/hongmingfu/Desktop/Brown University/DAT...
3,CEDAR,English,person_034,forge,FORGED,5,/Users/hongmingfu/Desktop/Brown University/DAT...
4,CEDAR,English,person_034,forge,FORGED,4,/Users/hongmingfu/Desktop/Brown University/DAT...
...,...,...,...,...,...,...,...
525,Hansig,Chinese,person_352,true,TRUE,5,/Users/hongmingfu/Desktop/Brown University/DAT...
526,Hansig,Chinese,person_352,true,TRUE,4,/Users/hongmingfu/Desktop/Brown University/DAT...
527,Hansig,Chinese,person_352,true,TRUE,1,/Users/hongmingfu/Desktop/Brown University/DAT...
528,Hansig,Chinese,person_352,true,TRUE,3,/Users/hongmingfu/Desktop/Brown University/DAT...


Unnamed: 0,Data Source,Language,Seed,Person ID/Name,Class,Image ID,Image File
0,Real_Fake_Data_Sampled,Sampled,random_seeds_707,person_01,FORGED,20,/Users/hongmingfu/Desktop/Brown University/DAT...
1,Real_Fake_Data_Sampled,Sampled,random_seeds_707,person_01,FORGED,37,/Users/hongmingfu/Desktop/Brown University/DAT...
2,Real_Fake_Data_Sampled,Sampled,random_seeds_707,person_01,FORGED,26,/Users/hongmingfu/Desktop/Brown University/DAT...
3,Real_Fake_Data_Sampled,Sampled,random_seeds_707,person_01,FORGED,4,/Users/hongmingfu/Desktop/Brown University/DAT...
4,Real_Fake_Data_Sampled,Sampled,random_seeds_707,person_01,FORGED,14,/Users/hongmingfu/Desktop/Brown University/DAT...
...,...,...,...,...,...,...,...
525,Signature_Verification_Sampled,Sampled,random_seeds_123,person_047,TRUE,4,/Users/hongmingfu/Desktop/Brown University/DAT...
526,Signature_Verification_Sampled,Sampled,random_seeds_123,person_047,TRUE,5,/Users/hongmingfu/Desktop/Brown University/DAT...
527,Signature_Verification_Sampled,Sampled,random_seeds_123,person_047,TRUE,1,/Users/hongmingfu/Desktop/Brown University/DAT...
528,Signature_Verification_Sampled,Sampled,random_seeds_123,person_047,TRUE,2,/Users/hongmingfu/Desktop/Brown University/DAT...


In [2]:
preprocessed_df

Unnamed: 0,Label,Original Path,Preprocessed Path
0,true,/Users/hongmingfu/Desktop/Brown University/DAT...,/Users/hongmingfu/Desktop/Brown University/DAT...
1,true,/Users/hongmingfu/Desktop/Brown University/DAT...,/Users/hongmingfu/Desktop/Brown University/DAT...
2,true,/Users/hongmingfu/Desktop/Brown University/DAT...,/Users/hongmingfu/Desktop/Brown University/DAT...
3,true,/Users/hongmingfu/Desktop/Brown University/DAT...,/Users/hongmingfu/Desktop/Brown University/DAT...
4,true,/Users/hongmingfu/Desktop/Brown University/DAT...,/Users/hongmingfu/Desktop/Brown University/DAT...
...,...,...,...
355,true,/Users/hongmingfu/Desktop/Brown University/DAT...,/Users/hongmingfu/Desktop/Brown University/DAT...
356,true,/Users/hongmingfu/Desktop/Brown University/DAT...,/Users/hongmingfu/Desktop/Brown University/DAT...
357,true,/Users/hongmingfu/Desktop/Brown University/DAT...,/Users/hongmingfu/Desktop/Brown University/DAT...
358,true,/Users/hongmingfu/Desktop/Brown University/DAT...,/Users/hongmingfu/Desktop/Brown University/DAT...


In [3]:
sampled_df

Unnamed: 0,Method,Seed,Person Folder,Image File
0,CEDAR,[123],forge,/Users/hongmingfu/Desktop/Brown University/DAT...
1,CEDAR,[123],forge,/Users/hongmingfu/Desktop/Brown University/DAT...
2,CEDAR,[123],forge,/Users/hongmingfu/Desktop/Brown University/DAT...
3,CEDAR,[123],forge,/Users/hongmingfu/Desktop/Brown University/DAT...
4,CEDAR,[123],forge,/Users/hongmingfu/Desktop/Brown University/DAT...
...,...,...,...,...
355,Hansig,555,true,/Users/hongmingfu/Desktop/Brown University/DAT...
356,Hansig,555,true,/Users/hongmingfu/Desktop/Brown University/DAT...
357,Hansig,555,true,/Users/hongmingfu/Desktop/Brown University/DAT...
358,Hansig,555,true,/Users/hongmingfu/Desktop/Brown University/DAT...


In [8]:
import pandas as pd

# Load the EfficientNet DataFrame from the pickle at the path below.
# NOTE: this rebinds `preprocessed_df`, replacing the frame shown by earlier cells.
# NOTE(review): unpickling can execute arbitrary code; only load pickles produced locally.
preprocessed_df = pd.read_pickle('/Users/hongmingfu/Desktop/Brown University/DATA2050/preprocessed_dataset/preprocessed/EfficientNet/preprocessed_signature_df_EfficientNet.pkl')
# Display the DataFrame
preprocessed_df



Unnamed: 0,person_id,image,label
0,person_01,"[[[2.2489082969432315, 2.428571428571429, 2.63...",1
1,person_01,"[[[-2.1179039301310043, -2.0357142857142856, -...",1
2,person_01,"[[[-2.1179039301310043, -2.0357142857142856, -...",1
3,person_01,"[[[-2.1179039301310043, -2.0357142857142856, -...",1
4,person_01,"[[[-2.1179039301310043, -2.0357142857142856, -...",1
...,...,...,...
355,person_022,"[[[-2.1179039301310043, -2.0357142857142856, -...",0
356,person_022,"[[[-2.1179039301310043, -2.0357142857142856, -...",0
357,person_022,"[[[-2.1179039301310043, -2.0357142857142856, -...",0
358,person_022,"[[[-2.1179039301310043, -2.0357142857142856, -...",0


In [13]:
import numpy as np

def check_images_identical(df, index1, index2):
    """
    Report whether two entries of the 'image' column hold equal arrays.

    The two images are looked up by position, compared with
    ``np.array_equal``, and the outcome is printed as well as returned.

    Args:
        df (DataFrame): The DataFrame containing an 'image' column.
        index1 (int): Position of the first image.
        index2 (int): Position of the second image.

    Returns:
        bool: True if the images are identical, False otherwise.
    """
    first = df['image'].iloc[index1]
    second = df['image'].iloc[index2]
    identical = np.array_equal(first, second)

    message = (
        f"Images at index {index1} and {index2} are identical."
        if identical
        else f"Images at index {index1} and {index2} are not identical."
    )
    print(message)

    return identical

# Example usage: check whether the images at positions 4 and 2 are identical
check_images_identical(preprocessed_df, 4, 2)


Images at index 4 and 2 are not identical.


False