In [11]:
import pandas as pd
from sklearn.model_selection import train_test_split
import os

# --- Configuration ----------------------------------------------------------
input_csv = "DATA/face_detection_label.csv"
train_csv = "DATA/face_detection_train_data.csv"
test_csv = "DATA/Test_Face.csv"
train_ratio = 0.7  # fraction of rows assigned to the training split

df = pd.read_csv(input_csv)

# Bucket ages into groups. pd.cut builds right-closed intervals by default, so
# age 6 lands in 'lt6' (0, 6] and age 12 in '6-12' (6, 12]. Ages outside
# (0, 100] receive NaN as their group.
bins = [0, 6, 12, 18, 24, 30, 36, 42, 48, 54, 60, 100]
labels = ['lt6', '6-12', '12-18', '18-24', '24-30', '30-36', '36-42', '42-48', '48-54', '54-60', '60plus']
df['age_group'] = pd.cut(df['age'], bins=bins, labels=labels)

# Count once; the same Series feeds both the report and the filter below.
group_counts = df['age_group'].value_counts()
valid_groups = group_counts[group_counts >= 2].index
print(group_counts)

# Keep only groups with at least two rows (a stratified split cannot place a
# single-row class in both partitions). NaN never matches isin(), so rows whose
# age fell outside the bins are removed here as well; report that instead of
# losing them silently.
rows_before = len(df)
df = df[df['age_group'].isin(valid_groups)]
rows_dropped = rows_before - len(df)
if rows_dropped:
    print(f"Dropped {rows_dropped} rows without a usable age_group")

train_df, test_df = train_test_split(
    df,
    train_size=train_ratio,
    stratify=df['age_group'],
    shuffle=True,
    random_state=42
)

# Reduce image paths to bare file names. `.assign` returns a new frame, which
# avoids pandas' SettingWithCopyWarning that in-place column assignment on the
# frames returned by train_test_split would trigger.
train_df = train_df.assign(image=train_df['image'].apply(os.path.basename))
test_df = test_df.assign(image=test_df['image'].apply(os.path.basename))

train_df.to_csv(train_csv, index=False)
test_df.to_csv(test_csv, index=False)

print(f"Đã chia xong: {len(train_df)} train / {len(test_df)} test")


age_group
24-30     5121
30-36     2551
18-24     2044
60plus    1875
36-42     1598
48-54     1270
lt6       1087
54-60     1014
42-48     1012
12-18      850
6-12       663
Name: count, dtype: int64
Đã chia xong: 13359 train / 5726 test


In [10]:
import os
import pandas as pd
import tensorflow as tf
import numpy as np
import random
from collections import defaultdict
from PIL import Image

# --- Configuration ----------------------------------------------------------
input_csv = "DATA/face_detection_train_data.csv"
# Assemble the directory from its components rather than a backslash literal:
# on Windows this yields the same string, and on other platforms it resolves to
# a real nested directory instead of one folder whose name contains backslashes.
output_dir = os.path.join("DATA", "UTKFace", "Train_Aug_Face")
aug_csv_path = "DATA/Train_Aug_Face.csv"

os.makedirs(output_dir, exist_ok=True)

df = pd.read_csv(input_csv)

# Row count of the largest age group; the balancing loop further down uses it
# as the number of images every group should end up with.
group_counts = df['age_group'].value_counts()
max_count = group_counts.max()

def augment_image(image):
    """Return a randomly perturbed copy of ``image``.

    Applies, in this order: random horizontal flip, brightness shift
    (max_delta=0.3), contrast scaling (0.7-1.3), saturation scaling (0.7-1.3)
    and hue shift (max_delta=0.1).

    No clipping is performed here, so the result may leave the value range of
    the input; callers that convert to integer pixels should clamp first.
    """
    pipeline = (
        tf.image.random_flip_left_right,
        lambda t: tf.image.random_brightness(t, max_delta=0.3),
        lambda t: tf.image.random_contrast(t, 0.7, 1.3),
        lambda t: tf.image.random_saturation(t, 0.7, 1.3),
        lambda t: tf.image.random_hue(t, 0.1),
    )
    result = image
    for transform in pipeline:
        result = transform(result)
    return result

def load_and_preprocess(path):
    """Load the JPEG at ``path`` as a 3-channel image resized to 224x224.

    The resized pixel values are divided by 255.0 before being returned.
    """
    raw_bytes = tf.io.read_file(path)
    decoded = tf.image.decode_jpeg(raw_bytes, channels=3)
    resized = tf.image.resize(decoded, [224, 224])
    return resized / 255.0

# Seed both random sources used below (Python's `random` picks the source image,
# TensorFlow drives the random image ops) so that re-running the cell
# regenerates the same augmented set.
random.seed(42)
tf.random.set_seed(42)

image_root = "DATA/UTKFace/Face_Detection"  # loop-invariant, so defined once
save_count = defaultdict(int)  # images written so far, per age group
grouped = df.groupby('age_group')
augmented_data = []  # (filename, age_group) rows for the output CSV


def _save_sample(img, label):
    """Write one image into ``output_dir`` and record it in ``augmented_data``.

    ``img`` is expected on the scale produced by ``load_and_preprocess``
    (pixel / 255). It is rescaled by 255, clamped to [0, 255] and cast to uint8
    before saving. The file name is ``<label>_<running index>.jpg``.
    """
    pixels = tf.cast(tf.clip_by_value(img * 255.0, 0, 255), tf.uint8)
    filename = f"{label}_{save_count[label]:05d}.jpg"
    save_path = os.path.join(output_dir, filename)
    tf.keras.utils.save_img(save_path, pixels.numpy())
    augmented_data.append((filename, label))
    save_count[label] += 1


for label, group in grouped:
    paths = [os.path.join(image_root, fname) for fname in group['image'].tolist()]

    # Pass 1: re-save every original image (resized), remembering which files
    # could actually be read so that pass 2 never samples a broken one.
    usable_paths = []
    for path in paths:
        try:
            _save_sample(load_and_preprocess(path), label)
            usable_paths.append(path)
        except Exception as e:
            print(f"❌ Lỗi đọc ảnh: {path} ({e})")

    if not usable_paths:
        # random.choice raises on an empty sequence; skip the group instead.
        print(f"❌ No readable images for group {label}; skipping augmentation")
        continue

    # Pass 2: top the group up to max_count with augmented copies. The deficit
    # is based on what was really written, so originals that failed to load
    # are compensated for rather than leaving the group short.
    needed = max_count - save_count[label]
    for _ in range(needed):
        path = random.choice(usable_paths)
        try:
            _save_sample(augment_image(load_and_preprocess(path)), label)
        except Exception as e:
            print(f"❌ Lỗi augment ảnh: {path} ({e})")

print("Imaged saved: ", output_dir)

# Note: the "image_path" column holds file names relative to output_dir.
aug_df = pd.DataFrame(augmented_data, columns=["image_path", "age_group"])
aug_df.to_csv(aug_csv_path, index=False)
print("File_path saved", aug_csv_path)

print(aug_df['age_group'].value_counts())


Imaged saved:  DATA\UTKFace\Train_Aug_Face
File_path saved DATA/Train_Aug_Face.csv
age_group
12-18     3585
18-24     3585
24-30     3585
30-36     3585
36-42     3585
42-48     3585
48-54     3585
54-60     3585
6-12      3585
60plus    3585
lt6       3585
Name: count, dtype: int64
