In [3]:
import cv2
import numpy as np
import os
from sklearn.model_selection import train_test_split
from tensorflow.keras.utils import to_categorical

# Input directory (one sub-folder per class) and output directory for .npy files.
RAW_DATA_DIR = "../Dataset/train"
PROCESSED_DATA_DIR = "../Processed"

# Target size handed to cv2.resize for every image.
IMAGE_SIZE = (128, 128)

# Class sub-folder names; a class's position in this list is its integer label.
SKIN_TONE_CLASSES = ["White", "Brown", "Black"]

# Inclusive lower/upper bounds used with cv2.inRange on the YCrCb-converted image.
YCrCb_min = np.asarray((0, 133, 77), dtype=np.uint8)
YCrCb_max = np.asarray((255, 173, 127), dtype=np.uint8)

# Inclusive lower/upper bounds used with cv2.inRange on the HSV-converted image.
HSV_min = np.asarray((0, 40, 50), dtype=np.uint8)
HSV_max = np.asarray((50, 255, 255), dtype=np.uint8)

def detect_skin(image):
    """Return a copy of *image* in which only skin-coloured pixels are kept.

    The image is treated as BGR (it is converted with the ``COLOR_BGR2*``
    codes). A pixel is kept only if it falls inside BOTH the YCrCb range
    (``YCrCb_min``..``YCrCb_max``) and the HSV range (``HSV_min``..``HSV_max``);
    every other pixel is masked out.

    Args:
        image: BGR image array, e.g. as returned by ``cv2.imread``.

    Returns:
        An image of the same size with non-skin pixels masked out.
    """
    # Threshold each colour space independently.
    in_ycrcb_range = cv2.inRange(
        cv2.cvtColor(image, cv2.COLOR_BGR2YCrCb), YCrCb_min, YCrCb_max
    )
    in_hsv_range = cv2.inRange(
        cv2.cvtColor(image, cv2.COLOR_BGR2HSV), HSV_min, HSV_max
    )

    # Requiring agreement between both colour spaces cuts false positives.
    combined_mask = cv2.bitwise_and(in_ycrcb_range, in_hsv_range)

    # AND-ing the image with itself under the mask keeps only masked pixels.
    return cv2.bitwise_and(image, image, mask=combined_mask)

def load_and_preprocess_data():
    """Load every class folder, skin-mask the images and split into datasets.

    For each name in ``SKIN_TONE_CLASSES`` the folder
    ``RAW_DATA_DIR/<name>`` is scanned for JPEG files. Each readable image is
    passed through ``detect_skin``, resized to ``IMAGE_SIZE`` and scaled to
    the range [0, 1]. Labels are the class's index in ``SKIN_TONE_CLASSES``,
    one-hot encoded.

    The data is split 80/20 into train+val / test, and the train+val part is
    split 80/20 again into train / val (i.e. 64% / 16% / 20% overall), both
    with ``random_state=42``.

    Returns:
        ``((X_train, y_train), (X_val, y_val), (X_test, y_test))``.

    Raises:
        ValueError: if no image could be loaded from any class folder.
    """
    images, labels = [], []

    for class_index, class_name in enumerate(SKIN_TONE_CLASSES):
        class_dir = os.path.join(RAW_DATA_DIR, class_name)
        if not os.path.exists(class_dir):
            print(f"Warning: {class_dir} does not exist!")
            continue

        # os.listdir returns entries in arbitrary, filesystem-dependent order.
        # Sorting makes the seeded splits below reproducible across machines.
        # The extension check is case-insensitive so '.JPG'/'.jpeg' files are
        # not silently dropped.
        image_files = sorted(
            f for f in os.listdir(class_dir)
            if f.lower().endswith(('.jpg', '.jpeg'))
        )
        print(f"Processing {class_name}: {len(image_files)} images")

        for image_name in image_files:
            image_path = os.path.join(class_dir, image_name)
            image = cv2.imread(image_path)

            # cv2.imread signals an unreadable/corrupt file by returning None.
            if image is None:
                print(f"Warning: Can not read image {image_path}")
                continue

            skin_image = cv2.resize(detect_skin(image), IMAGE_SIZE)

            # Scale 8-bit pixel values into [0, 1].
            images.append(skin_image / 255.0)
            labels.append(class_index)

    # Fail early with an actionable message instead of an obscure error from
    # the label encoding or the split further down.
    if not images:
        raise ValueError(
            f"No images could be loaded from {RAW_DATA_DIR}; "
            f"expected sub-folders {SKIN_TONE_CLASSES} containing JPEG files."
        )

    images = np.array(images)
    labels = to_categorical(np.array(labels), num_classes=len(SKIN_TONE_CLASSES))

    X_train, X_test, y_train, y_test = train_test_split(
        images, labels, test_size=0.2, random_state=42
    )
    X_train, X_val, y_train, y_val = train_test_split(
        X_train, y_train, test_size=0.2, random_state=42
    )

    return (X_train, y_train), (X_val, y_val), (X_test, y_test)

def save_processed_data(X_train, y_train, X_val, y_val, X_test, y_test):
    """Write the six dataset arrays to ``PROCESSED_DATA_DIR`` as ``.npy`` files.

    Each array is saved with ``np.save`` under the name of its parameter,
    e.g. ``X_train`` -> ``PROCESSED_DATA_DIR/X_train.npy``. The directory is
    created if it does not exist yet.

    Args:
        X_train, y_train: training samples and labels.
        X_val, y_val: validation samples and labels.
        X_test, y_test: test samples and labels.
    """
    # exist_ok=True avoids the check-then-create race of a separate
    # os.path.exists() test and is a no-op when the directory is present.
    os.makedirs(PROCESSED_DATA_DIR, exist_ok=True)

    arrays = {
        'X_train': X_train,
        'y_train': y_train,
        'X_val': X_val,
        'y_val': y_val,
        'X_test': X_test,
        'y_test': y_test,
    }
    for name, array in arrays.items():
        np.save(os.path.join(PROCESSED_DATA_DIR, f'{name}.npy'), array)

def main():
    """Run the preprocessing pipeline: load, report split shapes, save."""
    print("Starting data preprocessing...")

    # Each split is an (X, y) pair.
    train_split, val_split, test_split = load_and_preprocess_data()

    train_x = train_split[0]
    val_x = val_split[0]
    test_x = test_split[0]
    print(f"Train: {train_x.shape}")
    print(f"Validation: {val_x.shape}")
    print(f"Test: {test_x.shape}")

    # Unpacks to (X_train, y_train, X_val, y_val, X_test, y_test).
    save_processed_data(*train_split, *val_split, *test_split)
    print("Data preprocessing completed successfully!")

# Run the pipeline only when this file is executed directly, not when imported.
if __name__ == "__main__":
    main()


Starting data preprocessing...
Processing White: 500 images
Processing Brown: 500 images
Processing Black: 500 images
Train: (960, 128, 128, 3)
Validation: (240, 128, 128, 3)
Test: (300, 128, 128, 3)
Data preprocessing completed successfully!
