In [None]:
# train.py - Age and Gender Prediction
import os
import re
import glob
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
import tensorflow as tf

# ----------------------------
# Config
# ----------------------------
# Root folder holding the UTKFace .jpg files; the loaders below look in this
# folder and in its immediate subfolders.
# NOTE(review): the path has no drive letter, so it is presumably resolved
# against the current drive on Windows - confirm before running elsewhere.
DATA_DIR = r"\Users\Diptanu Sarkar\Desktop\Age Detection\UTKFace"
IMG_SIZE = 224  # Images are resized to IMG_SIZE x IMG_SIZE; also the model input size
BATCH_SIZE = 32  # Number of examples per batch in the tf.data pipelines
EPOCHS = 30  # Upper bound on training epochs passed to model.fit
SEED = 42  # Seed for the dataset shuffle and the train/validation split
MODEL_OUT = "age_gender_cnn_utkface.h5"  # File written by the checkpoint callback and the final save

# ----------------------------
# Utilities
# ----------------------------
def parse_filename_info(path):
    """
    Extract the age and gender labels encoded in a UTKFace file name.

    UTKFace names follow ``[age]_[gender]_[race]_[date&time].jpg``; only the
    first two underscore-separated fields are read here.

    Returns:
        ``(age, gender)`` as ints when the base name has at least three
        fields, both leading fields parse as integers, age lies in 0..100
        and gender is 0 (male) or 1 (female); ``(None, None)`` otherwise.
    """
    fields = os.path.basename(path).split('_')
    if len(fields) < 3:
        return None, None

    try:
        age, gender = int(fields[0]), int(fields[1])
    except ValueError:
        # One of the two leading fields is not an integer.
        return None, None

    # Reject labels that fall outside the accepted ranges.
    if not (0 <= age <= 100) or gender not in (0, 1):
        return None, None
    return age, gender

def list_images_and_labels(data_dir):
    """Collect UTKFace image paths together with their age and gender labels.

    ``*.jpg`` files are gathered from ``data_dir`` itself and from its
    immediate subdirectories. Labels come from ``parse_filename_info``;
    files for which it returns ``None`` are dropped.

    Args:
        data_dir: Folder containing the UTKFace images.

    Returns:
        A tuple ``(paths, ages, genders)`` of equally long NumPy arrays:
        the file paths, the ages as ``float32`` and the genders as ``int32``.
    """
    # Look for images in the main directory and subdirectories
    patterns = [
        os.path.join(data_dir, "*.jpg"),
        os.path.join(data_dir, "*", "*.jpg"),
    ]

    files = []
    for pattern in patterns:
        files.extend(glob.glob(pattern))

    # glob returns entries in filesystem-dependent order; sort so that the
    # seeded train/validation split downstream is reproducible everywhere.
    files.sort()

    print(f"Found {len(files)} images.")
    if files:
        print("Example image paths:", files[:5])

    paths, ages, genders = [], [], []

    for path in files:
        age, gender = parse_filename_info(path)
        if age is None or gender is None:
            continue  # File name did not yield usable labels
        paths.append(path)
        ages.append(age)
        genders.append(gender)

    print(f"Valid images with labels: {len(paths)}")

    return (
        np.array(paths),
        np.array(ages, dtype=np.float32),
        np.array(genders, dtype=np.int32),
    )

def create_dataframe(data_dir):
    """Build a DataFrame of the labels encoded in UTKFace file names.

    ``*.jpg`` files directly inside ``data_dir`` are used; one level of
    subdirectories is searched only when the top level has none. Names are
    expected to look like ``[age]_[gender]_[race]_[date&time].jpg``.

    Args:
        data_dir: Folder containing the UTKFace images.

    Returns:
        A ``pandas.DataFrame`` with columns ``age``, ``gender``, ``race`` and
        ``img_name`` (the base file name). There is one row per file whose
        name has at least four underscore-separated fields with integer
        age, gender and race; all other files are skipped.
    """
    files = glob.glob(os.path.join(data_dir, "*.jpg"))
    if not files:  # Try subdirectories
        files = glob.glob(os.path.join(data_dir, "*", "*.jpg"))

    rows = []
    for file_path in files:
        file_name = os.path.basename(file_path)
        fields = file_name.split('_')

        if len(fields) < 4:
            continue

        try:
            # Parse every label before recording anything, so a bad later
            # field (e.g. a non-integer race) cannot leave the columns with
            # different lengths.
            row = (int(fields[0]), int(fields[1]), int(fields[2]), file_name)
        except ValueError:
            continue  # Skip files with invalid format
        rows.append(row)

    return pd.DataFrame(rows, columns=['age', 'gender', 'race', 'img_name'])

def decode_img(path):
    """Read a JPEG from disk and turn it into a model-ready float image.

    The file is decoded with three channels, resized to
    ``IMG_SIZE x IMG_SIZE`` and divided by 255.
    """
    raw_bytes = tf.io.read_file(path)
    pixels = tf.io.decode_jpeg(raw_bytes, channels=3)
    resized = tf.image.resize(pixels, (IMG_SIZE, IMG_SIZE))
    return tf.cast(resized, tf.float32) / 255.0

def augment(img):
    """Apply random photometric and flip augmentation to a training image.

    Args:
        img: Float image tensor; the caller passes images already scaled
            to [0, 1] by ``decode_img``.

    Returns:
        The image after a random horizontal flip and small random changes
        of brightness, contrast and saturation, clipped back to [0, 1].
    """
    img = tf.image.random_flip_left_right(img)
    img = tf.image.random_brightness(img, max_delta=0.1)
    img = tf.image.random_contrast(img, 0.9, 1.1)
    img = tf.image.random_saturation(img, 0.9, 1.1)
    # The jitter ops above can push values below 0 or above 1; clip so that
    # training inputs stay in the same range as un-augmented validation images.
    img = tf.clip_by_value(img, 0.0, 1.0)
    return img

def make_ds(paths, ages, genders, training=True):
    """Build a batched, prefetched ``tf.data`` pipeline over the given examples.

    Each element is ``(image, {'age_output': age, 'gender_output': gender})``.
    When ``training`` is true the examples are shuffled (seeded with ``SEED``)
    before loading, and every image is passed through ``augment``.
    """
    autotune = tf.data.AUTOTUNE

    def _to_example(img_path, age, gender):
        # `training` is a plain Python bool, so this branch is chosen when
        # the mapping function is built rather than per element.
        image = decode_img(img_path)
        image = augment(image) if training else image
        targets = {'age_output': age, 'gender_output': gender}
        return image, targets

    dataset = tf.data.Dataset.from_tensor_slices((paths, ages, genders))
    if training:
        # Shuffle the lightweight (path, label) tuples before decoding images.
        dataset = dataset.shuffle(8192, seed=SEED)

    dataset = dataset.map(_to_example, num_parallel_calls=autotune)
    dataset = dataset.batch(BATCH_SIZE)
    return dataset.prefetch(autotune)

# ----------------------------
# Model
# ----------------------------
def build_model(input_shape=(IMG_SIZE, IMG_SIZE, 3)):
    """Create and compile the two-headed CNN for age and gender prediction.

    A shared convolutional trunk feeds two heads: ``age_output`` (one unit,
    trained with MAE) and ``gender_output`` (one sigmoid unit, trained with
    binary cross-entropy). The returned model is already compiled with Adam.
    """
    layers = tf.keras.layers

    def conv_block(x, filters):
        # Two Conv -> BatchNorm -> ReLU stages followed by max pooling.
        for _ in range(2):
            x = layers.Conv2D(filters, 3, padding="same")(x)
            x = layers.BatchNormalization()(x)
            x = layers.ReLU()(x)
        return layers.MaxPool2D()(x)

    inputs = tf.keras.Input(shape=input_shape)

    # Shared feature extraction: four blocks with a growing number of filters
    x = inputs
    for filters in (32, 64, 128, 256):
        x = conv_block(x, filters)

    x = layers.GlobalAveragePooling2D()(x)
    x = layers.Dropout(0.3)(x)
    shared_features = layers.Dense(256, activation="relu")(x)

    # Age regression head
    # NOTE(review): a ReLU on the final regression unit keeps predictions
    # non-negative but passes no gradient while its pre-activation is
    # negative - consider whether a linear output is preferable.
    age_hidden = layers.Dropout(0.2)(shared_features)
    age_hidden = layers.Dense(128, activation="relu")(age_hidden)
    age_output = layers.Dense(1, activation="relu", name="age_output")(age_hidden)

    # Gender classification head
    gender_hidden = layers.Dropout(0.2)(shared_features)
    gender_hidden = layers.Dense(64, activation="relu")(gender_hidden)
    gender_output = layers.Dense(1, activation="sigmoid", name="gender_output")(gender_hidden)

    model = tf.keras.Model(
        inputs=inputs,
        outputs=[age_output, gender_output],
        name="age_gender_cnn",
    )

    # One loss and one metric per named output.
    # NOTE(review): both losses are weighted 1.0 although the age MAE is
    # numerically much larger than the gender cross-entropy - confirm that
    # this balance is intended.
    losses = {
        'age_output': 'mae',
        'gender_output': 'binary_crossentropy',
    }
    metrics = {
        'age_output': ['mae'],
        'gender_output': ['accuracy'],
    }
    loss_weights = {
        'age_output': 1.0,
        'gender_output': 1.0,
    }
    model.compile(
        optimizer=tf.keras.optimizers.Adam(1e-3),
        loss=losses,
        metrics=metrics,
        loss_weights=loss_weights,
    )

    return model

# ----------------------------
# Train
# ----------------------------
# Script entry point: summarise the dataset, split it, train the model with
# checkpointing / LR scheduling / early stopping, then save and evaluate it.
if __name__ == "__main__":
    # Tabular overview of the labels found in the file names (display only)
    print("Creating DataFrame...")
    df = create_dataframe(DATA_DIR)
    print(f"DataFrame shape: {df.shape}")
    print("DataFrame head:")
    print(df.head())

    # Get paths and labels for training
    paths, ages, genders = list_images_and_labels(DATA_DIR)

    if len(paths) == 0:
        raise SystemExit(f"No valid images found in {DATA_DIR}. Ensure UTKFace images are placed there.")

    print(f"\nAge distribution: min={ages.min()}, max={ages.max()}, mean={ages.mean():.1f}")
    print(f"Gender distribution: Male (0): {np.sum(genders==0)}, Female (1): {np.sum(genders==1)}")

    # Split data
    X_train, X_val, y_age_train, y_age_val, y_gender_train, y_gender_val = train_test_split(
        paths, ages, genders,
        test_size=0.15,
        random_state=SEED,
        stratify=genders  # Stratify by gender for balanced split
    )

    # Create datasets (augmentation and shuffling only for the training set)
    train_ds = make_ds(X_train, y_age_train, y_gender_train, training=True)
    val_ds = make_ds(X_val, y_age_val, y_gender_val, training=False)

    # Build and train model
    model = build_model()
    model.summary()

    callbacks = [
        # Keep the model with the lowest validation loss on disk
        tf.keras.callbacks.ModelCheckpoint(
            MODEL_OUT,
            monitor="val_loss",
            save_best_only=True,
            mode="min"
        ),
        # Halve the learning rate after 3 epochs without improvement
        tf.keras.callbacks.ReduceLROnPlateau(
            monitor="val_loss",
            factor=0.5,
            patience=3,
            mode="min"
        ),
        # Stop after 6 epochs without improvement and roll back to the best weights
        tf.keras.callbacks.EarlyStopping(
            monitor="val_loss",
            patience=6,
            restore_best_weights=True,
            mode="min"
        ),
    ]

    history = model.fit(
        train_ds,
        validation_data=val_ds,
        epochs=EPOCHS,
        callbacks=callbacks
    )

    # Final save
    # NOTE(review): this writes to the same file as the best-only checkpoint
    # above, so the checkpoint is replaced by whatever weights the model
    # holds when training ends - confirm that this is intended.
    model.save(MODEL_OUT)
    print(f"Training complete. Model saved to {MODEL_OUT}")

    # Print final metrics
    print("\nFinal validation metrics:")
    # Request a dict and look values up by name: the position of each value
    # in the list form of a multi-output model differs between Keras
    # versions, so indexing [1]/[2] can report a loss as the accuracy.
    val_results = model.evaluate(val_ds, verbose=0, return_dict=True)
    print(f"Overall loss: {val_results['loss']:.4f}")
    print(f"Age MAE: {val_results['age_output_mae']:.2f} years")
    print(f"Gender accuracy: {val_results['gender_output_accuracy']:.3f}")

Creating DataFrame...
DataFrame shape: (23705, 4)
DataFrame head:
   age  gender  race                                img_name
0  100       0     0  100_0_0_20170112213500903.jpg.chip.jpg
1  100       0     0  100_0_0_20170112215240346.jpg.chip.jpg
2  100       1     0  100_1_0_20170110183726390.jpg.chip.jpg
3  100       1     0  100_1_0_20170112213001988.jpg.chip.jpg
4  100       1     0  100_1_0_20170112213303693.jpg.chip.jpg
Found 23708 images.
Example image paths: ['\\Users\\Diptanu Sarkar\\Desktop\\Age Detection\\UTKFace\\100_0_0_20170112213500903.jpg.chip.jpg', '\\Users\\Diptanu Sarkar\\Desktop\\Age Detection\\UTKFace\\100_0_0_20170112215240346.jpg.chip.jpg', '\\Users\\Diptanu Sarkar\\Desktop\\Age Detection\\UTKFace\\100_1_0_20170110183726390.jpg.chip.jpg', '\\Users\\Diptanu Sarkar\\Desktop\\Age Detection\\UTKFace\\100_1_0_20170112213001988.jpg.chip.jpg', '\\Users\\Diptanu Sarkar\\Desktop\\Age Detection\\UTKFace\\100_1_0_20170112213303693.jpg.chip.jpg']
Valid images with labels: 

Epoch 1/30
[1m630/630[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 6s/step - age_output_loss: 15.0960 - age_output_mae: 15.0960 - gender_output_accuracy: 0.5551 - gender_output_loss: 0.7101 - loss: 15.8061



[1m630/630[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4062s[0m 6s/step - age_output_loss: 13.3628 - age_output_mae: 13.3670 - gender_output_accuracy: 0.5736 - gender_output_loss: 0.6879 - loss: 14.0552 - val_age_output_loss: 14.0672 - val_age_output_mae: 14.0248 - val_gender_output_accuracy: 0.5751 - val_gender_output_loss: 0.6861 - val_loss: 14.7134 - learning_rate: 0.0010
Epoch 2/30
[1m630/630[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 7s/step - age_output_loss: 11.4202 - age_output_mae: 11.4202 - gender_output_accuracy: 0.6213 - gender_output_loss: 0.6548 - loss: 12.0750



[1m630/630[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4468s[0m 7s/step - age_output_loss: 11.1581 - age_output_mae: 11.1596 - gender_output_accuracy: 0.6289 - gender_output_loss: 0.6483 - loss: 11.8081 - val_age_output_loss: 12.8155 - val_age_output_mae: 12.8302 - val_gender_output_accuracy: 0.6376 - val_gender_output_loss: 0.6413 - val_loss: 13.4690 - learning_rate: 0.0010
Epoch 3/30
[1m630/630[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4340s[0m 7s/step - age_output_loss: 10.3502 - age_output_mae: 10.3535 - gender_output_accuracy: 0.6390 - gender_output_loss: 0.6369 - loss: 10.9906 - val_age_output_loss: 13.4746 - val_age_output_mae: 13.4120 - val_gender_output_accuracy: 0.5619 - val_gender_output_loss: 0.6814 - val_loss: 14.0903 - learning_rate: 0.0010
Epoch 4/30
[1m630/630[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 7s/step - age_output_loss: 9.9454 - age_output_mae: 9.9454 - gender_output_accuracy: 0.6414 - gender_output_loss: 0.6346 - loss: 10.5801



[1m630/630[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4478s[0m 7s/step - age_output_loss: 9.8200 - age_output_mae: 9.8181 - gender_output_accuracy: 0.6378 - gender_output_loss: 0.6355 - loss: 10.4534 - val_age_output_loss: 10.4167 - val_age_output_mae: 10.3842 - val_gender_output_accuracy: 0.6438 - val_gender_output_loss: 0.6427 - val_loss: 11.0266 - learning_rate: 0.0010
Epoch 5/30
[1m630/630[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 6s/step - age_output_loss: 9.5704 - age_output_mae: 9.5704 - gender_output_accuracy: 0.6425 - gender_output_loss: 0.6361 - loss: 10.2066



[1m630/630[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3797s[0m 6s/step - age_output_loss: 9.4485 - age_output_mae: 9.4450 - gender_output_accuracy: 0.6441 - gender_output_loss: 0.6335 - loss: 10.0784 - val_age_output_loss: 9.7577 - val_age_output_mae: 9.7768 - val_gender_output_accuracy: 0.6522 - val_gender_output_loss: 0.6421 - val_loss: 10.4184 - learning_rate: 0.0010
Epoch 6/30
[1m630/630[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4063s[0m 6s/step - age_output_loss: 9.0476 - age_output_mae: 9.0539 - gender_output_accuracy: 0.6360 - gender_output_loss: 0.6357 - loss: 9.6896 - val_age_output_loss: 10.8592 - val_age_output_mae: 10.8194 - val_gender_output_accuracy: 0.6151 - val_gender_output_loss: 0.6530 - val_loss: 11.4720 - learning_rate: 0.0010
Epoch 7/30
[1m630/630[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4258s[0m 7s/step - age_output_loss: 8.7395 - age_output_mae: 8.7422 - gender_output_accuracy: 0.6403 - gender_output_loss: 0.6337 - loss: 9.3758 - val_age_outp



[1m630/630[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4381s[0m 7s/step - age_output_loss: 8.4541 - age_output_mae: 8.4523 - gender_output_accuracy: 0.6436 - gender_output_loss: 0.6286 - loss: 9.0811 - val_age_output_loss: 7.7394 - val_age_output_mae: 7.7577 - val_gender_output_accuracy: 0.6666 - val_gender_output_loss: 0.6025 - val_loss: 8.3571 - learning_rate: 0.0010
Epoch 9/30
[1m630/630[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4550s[0m 7s/step - age_output_loss: 8.2202 - age_output_mae: 8.2124 - gender_output_accuracy: 0.6505 - gender_output_loss: 0.6223 - loss: 8.8347 - val_age_output_loss: 10.6556 - val_age_output_mae: 10.6720 - val_gender_output_accuracy: 0.6486 - val_gender_output_loss: 0.6179 - val_loss: 11.2885 - learning_rate: 0.0010
Epoch 10/30
[1m630/630[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4103s[0m 7s/step - age_output_loss: 8.0550 - age_output_mae: 8.0575 - gender_output_accuracy: 0.6618 - gender_output_loss: 0.6119 - loss: 8.6689 - val_age_outpu



[1m630/630[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3793s[0m 6s/step - age_output_loss: 7.1684 - age_output_mae: 7.1602 - gender_output_accuracy: 0.7435 - gender_output_loss: 0.5053 - loss: 7.6657 - val_age_output_loss: 7.6270 - val_age_output_mae: 7.6603 - val_gender_output_accuracy: 0.7341 - val_gender_output_loss: 0.5259 - val_loss: 8.1876 - learning_rate: 5.0000e-04
Epoch 15/30
[1m164/630[0m [32m━━━━━[0m[37m━━━━━━━━━━━━━━━[0m [1m54:37[0m 7s/step - age_output_loss: 7.1420 - age_output_mae: 7.1420 - gender_output_accuracy: 0.7655 - gender_output_loss: 0.4773 - loss: 7.6192