In [1]:
import cv2
import os

INPUT_DIR = "Dataset"
OUTPUT_DIR = "Dataset_Faces"
IMG_SIZE = (224, 224)
MIN_FACE_SIZE = (80, 80)

# Load Haar Cascade
face_cascade = cv2.CascadeClassifier(
    cv2.data.haarcascades + "haarcascade_frontalface_default.xml"
)

# Create output base directory
os.makedirs(OUTPUT_DIR, exist_ok=True)

# Loop through each person folder
for person_name in os.listdir(INPUT_DIR):
    person_path = os.path.join(INPUT_DIR, person_name)

    if not os.path.isdir(person_path):
        continue

    print(f"Processing: {person_name}")

    # Create output folder for this person
    out_person_dir = os.path.join(OUTPUT_DIR, person_name)
    os.makedirs(out_person_dir, exist_ok=True)

    img_count = 0

    # Loop through images
    for img_name in os.listdir(person_path):
        if not img_name.lower().endswith((".jpg", ".png", ".jpeg")):
            continue

        img_path = os.path.join(person_path, img_name)
        img = cv2.imread(img_path)

        if img is None:
            continue

        gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)

        faces = face_cascade.detectMultiScale(
            gray,
            scaleFactor=1.2,
            minNeighbors=5,
            minSize=MIN_FACE_SIZE
        )

        # If no face found, skip
        if len(faces) == 0:
            print(f"No face: {img_name}")
            continue

        # Take the largest face 
        faces = sorted(faces, key=lambda x: x[2]*x[3], reverse=True)
        x, y, w, h = faces[0]

        face = img[y:y+h, x:x+w]
        face = cv2.resize(face, IMG_SIZE)

        save_name = f"{person_name}_{img_count}.jpg"
        save_path = os.path.join(out_person_dir, save_name)
        cv2.imwrite(save_path, face)

        img_count += 1

    print(f"Saved {img_count} faces\n")

print("All faces extracted successfully!")


Processing: 0
No face: IMG_20250605_151229.jpg
No face: IMG_20250926_091238.jpg
No face: IMG_20251116_150437.jpg
No face: IMG_20251217_220756(1).jpg
No face: IMG_20251217_220756.jpg
No face: IMG_20251217_220917(1).jpg
No face: IMG_20251217_220917.jpg
No face: IMG_20251217_220921_1(1).jpg
No face: IMG_20251217_220921_1.jpg
No face: IMG_20251217_220958.jpg
No face: IMG_20251217_221150.jpg
No face: IMG_20251217_221205.jpg
No face: IMG_20251217_221207.jpg
No face: IMG_20251217_221220.jpg
No face: IMG_20251217_221222.jpg
No face: IMG_20251217_221227.jpg
No face: IMG_20251217_221230.jpg
No face: IMG_20251217_221240.jpg
No face: IMG_20251217_221242.jpg
No face: IMG_20251217_221343.jpg
No face: IMG_20251217_221351.jpg
No face: IMG_20251217_221400.jpg
No face: IMG_20251217_222417.jpg
Saved 164 faces

Processing: 1
No face: PXL_20241216_104617691.MP.jpg
No face: PXL_20241216_104622268.MP.jpg
No face: PXL_20241216_104626759.MP.jpg
No face: PXL_20241216_104627533.jpg
No face: PXL_20241216_10462896

In [1]:
import os
import cv2
import random
import shutil
import numpy as np

INPUT_DIR = "Dataset_Faces"
OUTPUT_DIR = "Final_Dataset"

TRAIN_RATIO = 0.9
TARGET_TRAIN_COUNT = 520  
IMG_SIZE = (224, 224)

SEED = 42
random.seed(SEED)

def ensure_dir(path):
    os.makedirs(path, exist_ok=True)

def augment(img):
    aug = []

    # flip
    aug.append(cv2.flip(img, 1))

    # rotations
    for angle in [-10, 10]:
        M = cv2.getRotationMatrix2D((112,112), angle, 1.0)
        aug.append(cv2.warpAffine(img, M, IMG_SIZE))

    # brightness
    hsv = cv2.cvtColor(img, cv2.COLOR_BGR2HSV)
    hsv[:,:,2] = np.clip(hsv[:,:,2] * 1.2, 0, 255)
    aug.append(cv2.cvtColor(hsv, cv2.COLOR_HSV2BGR))

    return aug

# TRAIN - TEST SPLIT 
print("90:10 train-test split")

for cls in ["0", "1", "2"]:
    cls_path = os.path.join(INPUT_DIR, cls)
    images = [f for f in os.listdir(cls_path)
              if f.lower().endswith((".jpg",".png",".jpeg"))]

    random.shuffle(images)
    split_idx = int(len(images) * TRAIN_RATIO)

    train_imgs = images[:split_idx]
    test_imgs  = images[split_idx:]

    for split, img_list in zip(["train_raw", "test"], [train_imgs, test_imgs]):
        out_dir = os.path.join(OUTPUT_DIR, split, cls)
        ensure_dir(out_dir)

        for img in img_list:
            shutil.copy(
                os.path.join(cls_path, img),
                os.path.join(out_dir, img)
            )

print("Train-Test split done\n")

# OFFLINE AUGMENT TRAIN 
print("Offline augmentation to 520 images/class")

RAW_TRAIN = os.path.join(OUTPUT_DIR, "train_raw")
FINAL_TRAIN = os.path.join(OUTPUT_DIR, "train")

for cls in ["0", "1", "2"]:
    in_dir = os.path.join(RAW_TRAIN, cls)
    out_dir = os.path.join(FINAL_TRAIN, cls)
    ensure_dir(out_dir)

    imgs = [f for f in os.listdir(in_dir)
            if f.lower().endswith((".jpg",".png",".jpeg"))]

    saved = 0

    # Save originals first
    for img_name in imgs:
        img = cv2.imread(os.path.join(in_dir, img_name))
        if img is None:
            continue

        cv2.imwrite(os.path.join(out_dir, img_name), img)
        saved += 1

    idx = 0
    while saved < TARGET_TRAIN_COUNT:
        img_name = imgs[idx % len(imgs)]
        img = cv2.imread(os.path.join(in_dir, img_name))
        if img is None:
            idx += 1
            continue

        for aug in augment(img):
            if saved >= TARGET_TRAIN_COUNT:
                break
            save_name = f"aug_{saved}.jpg"
            cv2.imwrite(os.path.join(out_dir, save_name), aug)
            saved += 1

        idx += 1

    print(f"Class {cls}: {saved} train images created")

print("\nFINAL DATASET READY")


90:10 train-test split


FileNotFoundError: [WinError 3] The system cannot find the path specified: 'Dataset_Faces\\0'