<a href="https://colab.research.google.com/github/CarmenTheodoraCraciun/HairTextureClassification/blob/main/DataProcessing_HairTexture.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install opencv-python
!pip install tensorflow

Collecting opencv-python
  Downloading opencv_python-4.11.0.86-cp37-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (20 kB)
Downloading opencv_python-4.11.0.86-cp37-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (63.0 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m63.0/63.0 MB[0m [31m18.8 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: opencv-python
Successfully installed opencv-python-4.11.0.86
Collecting tensorflow
  Downloading tensorflow-2.19.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (4.1 kB)
Collecting astunparse>=1.6.0 (from tensorflow)
  Downloading astunparse-1.6.3-py2.py3-none-any.whl.metadata (4.4 kB)
Collecting flatbuffers>=24.3.25 (from tensorflow)
  Downloading flatbuffers-25.2.10-py2.py3-none-any.whl.metadata (875 bytes)
Collecting google-pasta>=0.1.1 (from tensorflow)
  Downloading google_pasta-0.2.0-py3-none-any.whl.metadata (814 bytes)
Collecting libclang>=13.0.0 (from tensorflow)
  D

# Enviroment

In [2]:
!rm -rf ./HairTextureClassification
!git clone https://github.com/CarmenTheodoraCraciun/HairTextureClassification.git

Cloning into 'HairTextureClassification'...
remote: Enumerating objects: 30880, done.[K
remote: Counting objects: 100% (45/45), done.[K
remote: Compressing objects: 100% (39/39), done.[K
remote: Total 30880 (delta 23), reused 8 (delta 6), pack-reused 30835 (from 4)[K
Receiving objects: 100% (30880/30880), 824.64 MiB | 68.52 MiB/s, done.
Resolving deltas: 100% (69/69), done.
Updating files: 100% (16610/16610), done.


In [6]:
import os
import cv2
import shutil
import gc
import numpy as np
from collections import Counter
import tensorflow as tf
from tensorflow.keras import mixed_precision
from tensorflow.keras.preprocessing.image import ImageDataGenerator
from google.colab import files
from PIL import Image

##Optimizing the training environment

* Enabling mixed precision for optimal performance on the GPU

In [4]:
mixed_precision.set_global_policy('mixed_float16')

##Cleaning up the TensorFlow session

* to avoid memory issues

In [5]:
tf.keras.backend.clear_session()
collected = gc.collect()
print(f"Garbage collector freed {collected} unreachable objects.")

Garbage collector freed 0 unreachable objects.


#Data distribution after the processing

In [10]:
dataset_dir = './HairTextureClassification/originalData'

# Supported image extensions
image_extensions = ('.jpg', '.jpeg', '.png')

# Store file paths and labels
image_paths = []
labels = []

# Traverse dataset directory
for label in os.listdir(dataset_dir):
    class_dir = os.path.join(dataset_dir, label)
    if os.path.isdir(class_dir):
        for fname in os.listdir(class_dir):
            if fname.lower().endswith(image_extensions):
                image_paths.append(os.path.join(class_dir, fname))
                labels.append(label)

# Dataset size
dataset_size = len(image_paths)
print(f"Total images in dataset: {dataset_size}")

# Class distribution
class_counts = Counter(labels)
print("Class distribution:")
for cls, count in sorted(class_counts.items()):
    print(f"  {cls}: {count} images")

# Optional: Check average image dimensions
image_sizes = []
for path in image_paths[:100]:  # limit to first 100 for performance
    try:
        with Image.open(path) as img:
            image_sizes.append(img.size)
    except Exception as e:
        print(f"Error reading {path}: {e}")

if image_sizes:
    avg_width = sum(w for w, h in image_sizes) / len(image_sizes)
    avg_height = sum(h for w, h in image_sizes) / len(image_sizes)
    print(f"Average image size (from first 100 samples): {int(avg_width)}x{int(avg_height)} pixels")

Total images in dataset: 3616
Class distribution:
  curly: 2060 images
  dreadlocks: 467 images
  kinky: 232 images
  straight: 527 images
  wavy: 330 images
Average image size (from first 100 samples): 547x651 pixels


# Data processing

##Resizes the original images

In [None]:
def is_image(file_path):
    try:
        img = cv2.imread(file_path)
        return img is not None
    except:
        return False

def resize_image(img, size):
    """
    Resizes an image using bilinear interpolation.

    Args:
        img: Input image.
        size: Desired size of the output image (width, height).

    Returns:
        Resized image.
    """
    original_height, original_width, _ = img.shape
    new_width, new_height = size
    resized_img = np.zeros((new_height, new_width, 3), dtype=np.uint8)

    for i in range(new_width):
        for j in range(new_height):
            #i, j = pixel in the resized image
            # x, y = pixel in the original image
            x = i * (original_width - 1) / (new_width - 1)
            y = j * (original_height - 1) / (new_height - 1)

            # Neighborhood values
            x0 = int(np.floor(x))
            x1 = min(x0 + 1, original_width - 1)
            y0 = int(np.floor(y))
            y1 = min(y0 + 1, original_height - 1)

            # Extract the intensity values ​​of neighbors
            Ia = img[y0, x0] # stanga sus
            Ib = img[y0, x1] # drepata sus
            Ic = img[y1, x0] # stanga jos
            Id = img[y1, x1] # dreapta jos

            # Calculates the weight of each neighboring to the final value
            wa = (x1 - x) * (y1 - y)
            wb = (x - x0) * (y1 - y)
            wc = (x1 - x) * (y - y0)
            wd = (x - x0) * (y - y0)

            # The final value of the new pixel
            pixel = wa * Ia + wb * Ib + wc * Ic + wd * Id
            resized_img[j, i] = np.round(pixel).astype(int)

    return resized_img

def preprocess_images(input_dir, size=(96, 96)):
    """
    Resizes images to the specified size and stores them in memory.

    Args:
        input_dir: Directory containing the input images.
        size: Desired size of the output images (width, height).

    Returns:
        images: List of processed images as NumPy arrays.
        labels: List of corresponding labels.
    """
    images = []
    labels = []

    print("Start processing data.")
    for category in os.listdir(input_dir):
        category_dir = os.path.join(input_dir, category)
        if not os.path.isdir(category_dir):
            continue

        num_images = 0  # Contor pentru imagini per categorie
        for idx, img_name in enumerate(os.listdir(category_dir)):
            img_path = os.path.join(category_dir, img_name)

            if is_image(img_path):
                img = cv2.imread(img_path)
                if img is not None:
                    img = resize_image(img, size)
                    images.append(img)
                    labels.append(category)
                    num_images += 1
                else:
                    print(f"Failed to load image: {img_path}")
            else:
                print(f"Not an image: {img_path}")

        print(f"Folder {category_dir} has {num_images} images.")

    return np.array(images), np.array(labels)

In [None]:
processed_images, processed_labels = preprocess_images('./HairTextureClassification/originalData', size=(96, 96))

Start processing data.
Folder ./HairTextureClassification/originalData/dreadlocks has 467 images.
Not an image: ./HairTextureClassification/originalData/curly/rs_1080x1080-200330130638-1080-ariana-grande-curly-hair-instagram-am-033020.gif
Folder ./HairTextureClassification/originalData/curly has 2060 images.
Folder ./HairTextureClassification/originalData/wavy has 331 images.
Folder ./HairTextureClassification/originalData/kinky has 232 images.
Folder ./HairTextureClassification/originalData/straight has 530 images.


## Oversampling for rare classes and Normalization

* Classes like straight, curly, dreadlocks have over more images.
* normalization: to contain only 0 and 1 values


In [None]:
def augumentation_images(images, labels, max_images_per_class=2000, size=(96, 96)):
    class_counts_dict = Counter(labels)
    class_counts = list(class_counts_dict.values())
    class_names = list(class_counts_dict.keys())

    augment_multiplier = {cls: max(1, max_images_per_class // count) for cls, count in zip(class_names, class_counts)}

    print("Augment multipliers:", augment_multiplier)

    datagen = ImageDataGenerator(
        rotation_range=20,
        width_shift_range=0.2,
        height_shift_range=0.2,
        shear_range=0.2,
        zoom_range=0.2,
        horizontal_flip=True,
        fill_mode='nearest',
    )

    augmented_images = []
    augmented_labels = []

    for img, label in zip(images, labels):
        img = img[np.newaxis, ...]  # Expand dimensions for datagen flow
        gen = datagen.flow(img, batch_size=1)

        num_augmentations = augment_multiplier[label]

        for _ in range(num_augmentations):
            aug_img = next(gen)[0]
            aug_img = cv2.resize(aug_img, size, interpolation=cv2.INTER_LINEAR)
            augmented_images.append(aug_img)
            augmented_labels.append(label)

    images_augmented = np.array(augmented_images)
    labels_augmented = np.array(augmented_labels)

    print("The new class distribution:", Counter(labels_augmented))
    print(f"Enlarged image dimensions: {images_augmented.shape}")
    print(f"Enlarged label sizes: {labels_augmented.shape}")

    return images_augmented, labels_augmented


In [None]:
images_augmented, labels_augmented = augumentation_images(processed_images, processed_labels, 2000)

Augment multipliers: {np.str_('dreadlocks'): 4, np.str_('curly'): 1, np.str_('wavy'): 6, np.str_('kinky'): 8, np.str_('straight'): 3}
The new class distribution: Counter({np.str_('curly'): 2060, np.str_('wavy'): 1986, np.str_('dreadlocks'): 1868, np.str_('kinky'): 1856, np.str_('straight'): 1590})
Enlarged image dimensions: (9360, 96, 96, 3)
Enlarged label sizes: (9360,)


In [None]:
def save_images_to_folders(images, labels, images_augmented, labels_augmented, output_dir="HairTextureClassification/processedData"):
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)

    class_counts = {}

    for img, label in zip(images, labels):
        assert img.shape[:2] == (96, 96), f"Error: Original image {label} is not 96x96!"
        class_dir = os.path.join(output_dir, label)
        if not os.path.exists(class_dir):
            os.makedirs(class_dir)

        class_counts[label] = class_counts.get(label, 0) + 1
        file_name = f"{label}_{class_counts[label]}.png"

        cv2.imwrite(os.path.join(class_dir, file_name), img.astype(np.uint8))

    for img, label in zip(images_augmented, labels_augmented):
        assert img.shape[:2] == (96, 96), f"Error: Augmented image {label} is not 96x96!"
        class_dir = os.path.join(output_dir, label)
        if not os.path.exists(class_dir):
            os.makedirs(class_dir)

        class_counts[label] += 1
        file_name = f"{label}_{class_counts[label]}.png"

        cv2.imwrite(os.path.join(class_dir, file_name), img.astype(np.uint8))

    print(f"Saved original and augmented images in {output_dir}")

In [None]:
save_images_to_folders(processed_images, processed_labels, images_augmented, labels_augmented)
shutil.

make_archive("processedData", "zip", "HairTextureClassification/processedData")
files.download("processedData.zip")

Saved original and augmented images in HairTextureClassification/processedData


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>