In [4]:
import os
from sklearn.model_selection import train_test_split
import shutil
from PIL import Image

In [5]:
# Setup paths
# The raw dataset location is machine-specific. Set the SCALP_SENSE_DATASET
# environment variable to point at another copy without editing the notebook;
# when it is unset the original location is used.
dataset_path = os.environ.get(
    "SCALP_SENSE_DATASET", r"D:\Final project\Scalp_Sense\dataset"
)
# The processed copy is written next to wherever the notebook is run from.
output_path = os.path.join(os.getcwd(), "processed_dataset")
# Order matters: split_and_copy pairs these names with its train/val/test lists.
splits = ["training", "validation", "testing"]
genders = ["male", "female"]

# Create split folders
for split in splits:
    os.makedirs(os.path.join(output_path, split), exist_ok=True)

# Function to split and copy images
def split_and_copy(images, src_path, dst_base_path, gender, category):
    """Split one class folder 70/15/15 and copy its files into the split tree.

    Args:
        images: File names (not full paths) found in ``src_path``.
        src_path: Folder the files are copied from.
        dst_base_path: Root of the output tree; files end up in
            ``dst_base_path/<split>/<gender>/<category>/``.
        gender: Gender folder name used in the destination path.
        category: Class folder name used in the destination path.

    Raises:
        ValueError: If there are too few images to fill every split. The
            message names the gender/category that failed.

    The split names come from the module-level ``splits`` list, paired in order
    with the training, validation and testing subsets.
    """
    # Directory listings come back in arbitrary order, so a fixed random_state
    # only yields a reproducible split once the input order is fixed too.
    images = sorted(images)

    if not images:
        # train_test_split rejects empty input; an empty folder has nothing to copy.
        print(f"No images found for {gender}/{category}; nothing to copy.")
        return

    try:
        # 30% is held out, then halved into validation and testing (15% each).
        train_imgs, temp_imgs = train_test_split(images, test_size=0.3, random_state=42)
        val_imgs, test_imgs = train_test_split(temp_imgs, test_size=0.5, random_state=42)
    except ValueError as err:
        # Re-raise with the folder identity so the offending class is obvious.
        raise ValueError(
            f"Cannot split {len(images)} image(s) for {gender}/{category} "
            f"into {len(splits)} sets: {err}"
        ) from err

    for img_set, split in zip([train_imgs, val_imgs, test_imgs], splits):
        dest_path = os.path.join(dst_base_path, split, gender, category)
        os.makedirs(dest_path, exist_ok=True)
        for img in img_set:
            shutil.copy(os.path.join(src_path, img), os.path.join(dest_path, img))

In [6]:
# Main processing
# Walk dataset_path/<gender>/<class folder>/ and hand each class folder's files
# to split_and_copy, which writes them under output_path.
for gender in genders:
    gender_path = os.path.join(dataset_path, gender)
    for subfolder in os.listdir(gender_path):
        subfolder_path = os.path.join(gender_path, subfolder)

        if not os.path.isdir(subfolder_path):
            continue

        # Keep only regular, non-hidden files: a nested directory would make
        # shutil.copy fail part-way through a class. Sorting fixes the input
        # order so the seeded split is the same on every run.
        images = sorted(
            f for f in os.listdir(subfolder_path)
            if not f.startswith('.')
            and os.path.isfile(os.path.join(subfolder_path, f))
        )

        if subfolder.lower() == "non-scalp":
            # The folder name is matched case-insensitively and normalised so
            # the output class folder is always spelled "non-scalp".
            print(f"Processing non-scalp images for {gender}")
            split_and_copy(images, subfolder_path, output_path, gender, "non-scalp")
        else:
            print(f"Processing stage {subfolder} for {gender}")
            split_and_copy(images, subfolder_path, output_path, gender, subfolder)

print("Dataset split and copy complete.")

Processing non-scalp images for male
Processing stage stage 1 for male
Processing stage stage 2 for male
Processing stage stage 3 for male
Processing stage stage 4 for male
Processing stage stage 5 for male
Processing stage stage 6 for male
Processing non-scalp images for female
Processing stage stage 1 for female
Processing stage stage 2 for female
Processing stage stage 3 for female
Processing stage stage 4 for female
Processing stage stage 5 for female
Dataset split and copy complete.


In [8]:
# Image compression and corruption check
# NOTE(review): None switches off Pillow's decompression-bomb size check. That
# is only reasonable if every file in the dataset is trusted — confirm.
Image.MAX_IMAGE_PIXELS = None
MAX_WIDTH = 3000
MAX_HEIGHT = 3000
COMPRESSION_QUALITY = 85
# Pixel modes that are written as JPEG without conversion. Anything else
# (e.g. RGBA or palette images) is converted to RGB before saving.
JPEG_SAFE_MODES = ("L", "RGB", "CMYK")

def compress_image(img_path):
    """Re-encode the file at ``img_path`` in place as a JPEG.

    Images wider than MAX_WIDTH or taller than MAX_HEIGHT are first shrunk with
    ``thumbnail`` to fit inside that box. The result is saved over the original
    path with quality COMPRESSION_QUALITY; the file name is not changed.

    Images whose mode JPEG cannot encode are converted to RGB first, which
    discards any alpha channel. Without the conversion the save raised and the
    handler below deleted an otherwise valid image.

    If any step still fails, the error is printed and the file is removed.
    """
    try:
        # Read and transform while the source handle is open, then release it
        # before overwriting the same path.
        with Image.open(img_path) as source:
            if source.size[0] > MAX_WIDTH or source.size[1] > MAX_HEIGHT:
                source.thumbnail((MAX_WIDTH, MAX_HEIGHT))
            if source.mode in JPEG_SAFE_MODES:
                # copy() yields a fully loaded image independent of the file.
                output_img = source.copy()
            else:
                output_img = source.convert("RGB")
        output_img.save(img_path, "JPEG", quality=COMPRESSION_QUALITY)
    except Exception as e:
        print(f"Error compressing {img_path}: {e}")
        os.remove(img_path)

# Validate and compress
# For every file under output_path/<split>/<gender>/<category>/: verify it with
# Pillow, delete it if verification fails, otherwise re-encode it in place.
for split in splits:
    print(f"Checking {split} set for corrupt or oversized images...")
    for gender in genders:
        split_gender_path = os.path.join(output_path, split, gender)
        if not os.path.exists(split_gender_path):
            continue
        for category in os.listdir(split_gender_path):
            category_path = os.path.join(split_gender_path, category)
            if not os.path.isdir(category_path):
                # A stray file at class-folder level cannot be listed.
                continue
            for img_name in os.listdir(category_path):
                if img_name.startswith('.'):
                    continue
                img_path = os.path.join(category_path, img_name)
                try:
                    # The context manager closes the handle even when verify()
                    # raises; an open handle blocks os.remove on Windows.
                    with Image.open(img_path) as img:
                        img.verify()
                except (IOError, SyntaxError):
                    print(f"Removing corrupt image: {img_path}")
                    os.remove(img_path)
                else:
                    # Only verified files are compressed. compress_image reports
                    # and handles its own failures, so it stays outside the try.
                    compress_image(img_path)

print("Image compression and corruption check complete.")

Checking training set for corrupt or oversized images...
Checking validation set for corrupt or oversized images...
Checking testing set for corrupt or oversized images...
Image compression and corruption check complete.


In [9]:
# Function to count images in a given folder
def count_images(folder):
    """Count the files under ``folder``, including all nested sub-folders.

    Names starting with '.' are not counted, matching the hidden-file filter
    this notebook applies when it lists and validates images. No check is made
    that a file is actually an image.

    Args:
        folder: Directory to walk.

    Returns:
        The number of non-hidden files found. A folder that does not exist
        yields 0, because os.walk produces nothing for it.
    """
    return sum(
        1
        for _root, _dirs, files in os.walk(folder)
        for name in files
        if not name.startswith('.')
    )

# Folders holding each split for each gender inside the processed dataset
train_path_men, train_path_women = (
    os.path.join(output_path, "training", "male"),
    os.path.join(output_path, "training", "female"),
)
val_path_men, val_path_women = (
    os.path.join(output_path, "validation", "male"),
    os.path.join(output_path, "validation", "female"),
)
test_path_men, test_path_women = (
    os.path.join(output_path, "testing", "male"),
    os.path.join(output_path, "testing", "female"),
)

# Report one line per (split, gender) folder, in the order listed here
count_report = [
    ("Training images (male):", train_path_men),
    ("Training images (female):", train_path_women),
    ("Validation images (male):", val_path_men),
    ("Validation images (female):", val_path_women),
    ("Testing images (male):", test_path_men),
    ("Testing images (female):", test_path_women),
]
for report_label, report_folder in count_report:
    print(report_label, count_images(report_folder))

Training images (male): 4753
Training images (female): 2952
Validation images (male): 1015
Validation images (female): 634
Testing images (male): 1022
Testing images (female): 637
