In [1]:
import kagglehub

# Download latest version

path = kagglehub.dataset_download("moritzm00/utkface-cropped")

print("Path to dataset files:", path)

Using Colab cache for faster access to the 'utkface-cropped' dataset.
Path to dataset files: /kaggle/input/utkface-cropped


In [2]:
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow.keras.applications import ResNet50
from tensorflow.keras.layers import Dense, GlobalAveragePooling2D
from tensorflow.keras.models import Model
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import gradio as gr
from transformers import pipeline
import cv2
import os
from PIL import Image
import io
import warnings
warnings.filterwarnings('ignore')

# Set random seed for reproducibility
np.random.seed(42)
tf.random.set_seed(42)

# Enable GPU memory growth
physical_devices = tf.config.list_physical_devices('GPU')
if physical_devices:
    tf.config.experimental.set_memory_growth(physical_devices[0], True)

In [4]:
import os
dataset_dir = '/kaggle/input/utkface-cropped/UTKFace'  # Or '/content/drive/MyDrive/UTKFace' if using Drive
print("Dataset directory exists:", os.path.exists(dataset_dir))
print("Number of files:", len([f for f in os.listdir(dataset_dir) if f.endswith('.jpg')]))

Dataset directory exists: True
Number of files: 23708


In [5]:
# Step 1: Imports and Setup (run if not already done)
import numpy as np
import pandas as pd
import tensorflow as tf  # For later steps, but optional here
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from transformers import pipeline
import cv2
import os
from PIL import Image
import warnings
warnings.filterwarnings('ignore')

# Set random seed for reproducibility
np.random.seed(42)
tf.random.set_seed(42)  # If using TF later

# Enable GPU memory growth (if GPU available in Kaggle)
physical_devices = tf.config.list_physical_devices('GPU')
if physical_devices:
    tf.config.experimental.set_memory_growth(physical_devices[0], True)

# Dataset path (as confirmed)
dataset_dir = '/kaggle/input/utkface-cropped/UTKFace'
print(f"Processing from: {dataset_dir}")

# Create output directory for hair masks and CSV (Kaggle writable area)
output_base = '/kaggle/working/UTKFace_processed'
os.makedirs(output_base, exist_ok=True)
hair_masks_dir = os.path.join(output_base, 'hair_masks')
os.makedirs(hair_masks_dir, exist_ok=True)
csv_path = os.path.join(output_base, 'utkface_with_hair_length.csv')
print(f"Hair masks will be saved to: {hair_masks_dir}")
print(f"CSV will be saved to: {csv_path}")

# Step 2: Load Pre-trained Hair Length Model
print("Loading hair length model...")
try:
    hair_classifier = pipeline("image-classification", model="Leilab/hair_lenght")
    print("Hair length model loaded successfully.")
except Exception as e:
    print(f"Error loading hair length model: {e}")
    # Fallback: If fails, you can mock predictions or stop
    raise

# Step 3: Define Hair Mask Function (unchanged)
def create_hair_mask(image):
    try:
        img_array = np.array(image)
        img_bgr = cv2.cvtColor(img_array, cv2.COLOR_RGB2BGR)
        img_hsv = cv2.cvtColor(img_bgr, cv2.COLOR_BGR2HSV)
        lower_hair = np.array([0, 0, 0])  # Dark colors (tune for better accuracy)
        upper_hair = np.array([180, 255, 100])
        mask = cv2.inRange(img_hsv, lower_hair, upper_hair)
        kernel = cv2.getStructuringElement(cv2.MORPH_ELLIPSE, (5, 5))
        mask = cv2.morphologyEx(mask, cv2.MORPH_OPEN, kernel, iterations=2)
        mask = cv2.morphologyEx(mask, cv2.MORPH_CLOSE, kernel, iterations=2)
        hair_region = cv2.bitwise_and(img_bgr, img_bgr, mask=mask)
        hair_region_rgb = cv2.cvtColor(hair_region, cv2.COLOR_BGR2RGB)
        return hair_region_rgb, mask
    except Exception as e:
        print(f"Error creating hair mask: {e}")
        return None, None

# Step 4: Generate Dataset with Hair Length and Save Masks
print("Starting image processing...")
images = []
ages = []
genders = []
hair_lengths = []
hair_mask_paths = []
max_images = 200  # Start small for testing; increase to 2000 later
valid_files = [f for f in os.listdir(dataset_dir) if f.endswith('.jpg')]
print(f"Found {len(valid_files)} .jpg files. Processing first {max_images}.")

processed_count = 0
for filename in valid_files[:max_images]:
    parts = filename.split('_')
    try:
        age = int(parts[0])
        gender = int(parts[1])  # 0: male, 1: female
        img_path = os.path.join(dataset_dir, filename)
        img = Image.open(img_path).resize((224, 224))
        hair_region, mask = create_hair_mask(img)
        if hair_region is not None and mask is not None:
            # Predict hair length on masked region
            hair_pred = hair_classifier(Image.fromarray(hair_region))[0]['label']
            hair_length = 1 if 'long' in hair_pred.lower() else 0  # Flexible matching

            # Save hair mask as PNG (grayscale binary)
            mask_filename = filename.replace('.jpg', '_hair_mask.png')
            mask_path = os.path.join(hair_masks_dir, mask_filename)
            mask_pil = Image.fromarray(mask.astype(np.uint8))  # Ensure uint8 for saving
            mask_pil.save(mask_path)

            # Collect data (use output_base for paths to keep "same address" feel)
            images.append(img_path)
            ages.append(age)
            genders.append(gender)
            hair_lengths.append(hair_length)
            hair_mask_paths.append(mask_path)
            processed_count += 1
            if processed_count % 50 == 0:  # Progress update
                print(f"Processed {processed_count}/{min(max_images, len(valid_files))}: {filename}")
        else:
            print(f"Skipped {filename}: Failed to create hair mask.")
    except (IndexError, ValueError, Exception) as e:
        print(f"Skipped {filename}: {e}")
        continue

if not images:
    print("Error: No valid images processed.")
    raise ValueError("No valid images loaded.")

# Step 5: Create and Save CSV
new_dataset = pd.DataFrame({
    'image_path': images,
    'age': ages,
    'gender': genders,
    'hair_length': hair_lengths,
    'hair_mask_path': hair_mask_paths
})
new_dataset.to_csv(csv_path, index=False)
print(f"\nNew dataset saved to: {csv_path} (Rows: {len(new_dataset)})")
print("CSV columns: image_path, age, gender, hair_length, hair_mask_path")
print("\nPreview of first 5 rows:")
print(new_dataset.head())

# Optional: Zip outputs for easy download in Kaggle
!zip -r /kaggle/working/UTKFace_hair_masks.zip {hair_masks_dir} {csv_path}
print("\nZipped outputs saved to /kaggle/working/UTKFace_hair_masks.zip – Download from the Output panel!")

Processing from: /kaggle/input/utkface-cropped/UTKFace
Hair masks will be saved to: /kaggle/working/UTKFace_processed/hair_masks
CSV will be saved to: /kaggle/working/UTKFace_processed/utkface_with_hair_length.csv
Loading hair length model...


Invalid model-index. Not loading eval results into CardData.
Invalid model-index. Not loading eval results into CardData.


Fetching 1 files:   0%|          | 0/1 [00:00<?, ?it/s]

Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.52, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.
Device set to use cpu


Hair length model loaded successfully.
Starting image processing...
Found 23708 .jpg files. Processing first 200.
Processed 50/200: 72_0_2_20170112205305186.jpg.chip.jpg
Processed 100/200: 70_0_3_20170119203826438.jpg.chip.jpg
Processed 150/200: 5_0_1_20170103205158426.jpg.chip.jpg
Processed 200/200: 80_0_0_20170117175515838.jpg.chip.jpg

New dataset saved to: /kaggle/working/UTKFace_processed/utkface_with_hair_length.csv (Rows: 200)
CSV columns: image_path, age, gender, hair_length, hair_mask_path

Preview of first 5 rows:
                                          image_path  age  gender  \
0  /kaggle/input/utkface-cropped/UTKFace/26_0_2_2...   26       0   
1  /kaggle/input/utkface-cropped/UTKFace/22_1_1_2...   22       1   
2  /kaggle/input/utkface-cropped/UTKFace/21_1_3_2...   21       1   
3  /kaggle/input/utkface-cropped/UTKFace/28_0_0_2...   28       0   
4  /kaggle/input/utkface-cropped/UTKFace/17_1_4_2...   17       1   

   hair_length                                     hair

In [6]:
# Step 6: Preprocess Data for Model (Load Images from CSV)
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow.keras.applications import ResNet50
from tensorflow.keras.layers import Dense, GlobalAveragePooling2D
from tensorflow.keras.models import Model
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import gradio as gr
from transformers import pipeline  # Already loaded, but for completeness
import cv2
import os
from PIL import Image
import warnings
warnings.filterwarnings('ignore')

# Load the CSV we just created
csv_path = '/kaggle/working/UTKFace_processed/utkface_with_hair_length.csv'
new_dataset = pd.read_csv(csv_path)
print(f"Loaded dataset from CSV: {len(new_dataset)} rows")
print("Sample hair lengths:", new_dataset['hair_length'].value_counts())  # Check distribution

def load_and_preprocess_image(path):
    try:
        img = tf.keras.preprocessing.image.load_img(path, target_size=(224, 224))
        img = tf.keras.preprocessing.image.img_to_array(img)
        img = tf.keras.applications.resnet50.preprocess_input(img)
        return img
    except Exception as e:
        print(f"Error loading image {path}: {e}")
        return None

# Load images, ages, genders (hair_length not used in training, but available)
X = []
y_age = []
y_gender = []
for path, age, gender in zip(new_dataset['image_path'], new_dataset['age'], new_dataset['gender']):
    img = load_and_preprocess_image(path)
    if img is not None:
        X.append(img)
        y_age.append(age)
        y_gender.append(gender)

X = np.array(X)
y_age = np.array(y_age)
y_gender = np.array(y_gender)

if len(X) == 0:
    print("Error: No images successfully loaded.")
    raise ValueError("No images loaded.")

print(f"Successfully loaded {len(X)} images for training.")

# Split data (80/20)
X_train, X_test, y_age_train, y_age_test, y_gender_train, y_gender_test = train_test_split(
    X, y_age, y_gender, test_size=0.2, random_state=42
)

# Scale age labels (for regression)
age_scaler = StandardScaler()
y_age_train_scaled = age_scaler.fit_transform(y_age_train.reshape(-1, 1)).flatten()
y_age_test_scaled = age_scaler.transform(y_age_test.reshape(-1, 1)).flatten()

print(f"Train set: {len(X_train)} samples | Test set: {len(X_test)} samples")

# Step 7: Build and Train Lightweight ResNet50 Model
print("Building and training model...")
base_model = ResNet50(weights='imagenet', include_top=False, input_shape=(224, 224, 3))
x = GlobalAveragePooling2D()(base_model.output)
x = Dense(128, activation='relu')(x)
age_output = Dense(1, name='age')(x)
gender_output = Dense(1, activation='sigmoid', name='gender')(x)
model = Model(inputs=base_model.input, outputs=[age_output, gender_output])

# Freeze base layers for transfer learning
for layer in base_model.layers:
    layer.trainable = False

model.compile(optimizer='adam',
              loss={'age': 'mse', 'gender': 'binary_crossentropy'},
              metrics={'age': 'mae', 'gender': 'accuracy'})

# Train (5 epochs; increase if needed)
history = model.fit(X_train, {'age': y_age_train_scaled, 'gender': y_gender_train},
                    validation_split=0.2, epochs=5, batch_size=32, verbose=1)

# Evaluate
eval_results = model.evaluate(X_test, {'age': y_age_test_scaled, 'gender': y_gender_test}, verbose=0)
print(f"Test Age MAE: {eval_results[3]:.2f} years | Test Gender Accuracy: {eval_results[4]:.2f}")

# Save model
model.save('/kaggle/working/utkface_model.h5')
print("Model saved to /kaggle/working/utkface_model.h5")

# Step 8: Prediction Function for Gradio (Uses hair_classifier from before)
def predict_from_image(image):
    if image is None:
        return "Error: No image provided.", None

    try:
        # Create hair mask (on-the-fly, like training)
        hair_region, hair_mask = create_hair_mask(image)
        if hair_region is None or hair_mask is None:
            return "Error: Failed to create hair mask.", None

        # Preprocess for ResNet
        img_array = tf.keras.preprocessing.image.img_to_array(image.resize((224, 224)))
        img_array = tf.expand_dims(tf.keras.applications.resnet50.preprocess_input(img_array), 0)

        # Predict age and gender
        age_pred_scaled, gender_pred = model.predict(img_array, verbose=0)
        age_pred = age_scaler.inverse_transform(age_pred_scaled)[0][0]
        actual_gender = 'Female' if gender_pred[0][0] >= 0.5 else 'Male'

        # Predict hair length (using loaded hair_classifier)
        hair_pred = hair_classifier(Image.fromarray(hair_region))[0]['label']

        # Task logic: For ages 20-30, adjust gender based on hair
        if 20 <= age_pred <= 30:
            if hair_pred.lower() == 'long':
                predicted_gender = 'Female'
            elif hair_pred.lower() == 'short' and actual_gender == 'Female':
                predicted_gender = 'Male'
            else:
                predicted_gender = actual_gender
        else:
            predicted_gender = actual_gender

        # Convert mask to PIL for display
        mask_pil = Image.fromarray(hair_mask)
        return (f"Extracted Age: {int(age_pred)}\n"
                f"Extracted Hair Length: {hair_pred}\n"
                f"Extracted Actual Gender: {actual_gender}\n"
                f"Final Predicted Gender (with rules): {predicted_gender}"), mask_pil
    except Exception as e:
        return f"Error processing image: {e}", None


Loaded dataset from CSV: 200 rows
Sample hair lengths: hair_length
0    200
Name: count, dtype: int64
Successfully loaded 200 images for training.
Train set: 160 samples | Test set: 40 samples
Building and training model...
Downloading data from https://storage.googleapis.com/tensorflow/keras-applications/resnet/resnet50_weights_tf_dim_ordering_tf_kernels_notop.h5
[1m94765736/94765736[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 0us/step
Epoch 1/5
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m38s[0m 8s/step - age_loss: 3.1757 - age_mae: 1.4562 - gender_accuracy: 0.4313 - gender_loss: 0.7873 - loss: 3.9631 - val_age_loss: 0.9568 - val_age_mae: 0.8128 - val_gender_accuracy: 0.6562 - val_gender_loss: 0.6195 - val_loss: 1.5763
Epoch 2/5
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m37s[0m 6s/step - age_loss: 1.9507 - age_mae: 1.1580 - gender_accuracy: 0.6687 - gender_loss: 0.6164 - loss: 2.5671 - val_age_loss: 1.2790 - val_age_mae: 1.0077 - val_gender_accuracy



Test Age MAE: 0.80 years | Test Gender Accuracy: 0.75
Model saved to /kaggle/working/utkface_model.h5


In [7]:
from google.colab import files
files.download("/kaggle/working/utkface_model.h5")

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:
# Diagnosis Code: Visualize a Sample and Re-Predict Hair Length
import matplotlib.pyplot as plt
from PIL import Image
import cv2
import numpy as np
from transformers import pipeline  # hair_classifier should be loaded from before

# Use first sample from your preview (adjust path if needed)
sample_image_path = '/kaggle/input/utkface-cropped/UTKFace/26_0_2_20170104023102422.jpg.chip.jpg'  # From preview
sample_mask_path = '/kaggle/working/UTKFace_processed/hair_masks/26_0_2_20170104023102422_hair_mask.png.chip_hair_mask.png'

# Load original and mask
original_img = Image.open(sample_image_path)
mask_img = Image.open(sample_mask_path)

# Create hair region from mask (for prediction)
mask_array = np.array(mask_img)
img_bgr = cv2.cvtColor(np.array(original_img), cv2.COLOR_RGB2BGR)
hair_region_bgr = cv2.bitwise_and(img_bgr, img_bgr, mask=mask_array)
hair_region_rgb = cv2.cvtColor(hair_region_bgr, cv2.COLOR_BGR2RGB)
hair_pil = Image.fromarray(hair_region_rgb)

# Re-predict with model
try:
    hair_pred_full = hair_classifier(hair_pil)[0]
    print(f"Raw Prediction: {hair_pred_full['label']} (confidence: {hair_pred_full['score']:.2f})")
    hair_length = 1 if 'long' in hair_pred_full['label'].lower() else 0
    print(f"Computed hair_length: {hair_length}")
except Exception as e:
    print(f"Prediction error: {e}")

# Visualize
fig, axes = plt.subplots(1, 3, figsize=(15, 5))
axes[0].imshow(original_img)
axes[0].set_title('Original Image')
axes[1].imshow(mask_img, cmap='gray')
axes[1].set_title('Hair Mask (Binary)')
axes[2].imshow(hair_pil)
axes[2].set_title('Hair Region (Masked)')
plt.show()

In [None]:
def create_hair_mask(image):
    try:
        img_array = np.array(image)
        img_bgr = cv2.cvtColor(img_array, cv2.COLOR_RGB2BGR)
        img_hsv = cv2.cvtColor(img_bgr, cv2.COLOR_BGR2HSV)

        # Multiple ranges: Dark hair + Brown + Blonde/Light
        lower_dark = np.array([0, 0, 0])
        upper_dark = np.array([180, 255, 50])  # Lower value threshold for darker
        mask_dark = cv2.inRange(img_hsv, lower_dark, upper_dark)

        lower_brown = np.array([10, 50, 50])  # Brown tones
        upper_brown = np.array([25, 200, 150])
        mask_brown = cv2.inRange(img_hsv, lower_brown, upper_brown)

        lower_blonde = np.array([20, 30, 100])  # Blonde/light
        upper_blonde = np.array([35, 150, 255])
        mask_blonde = cv2.inRange(img_hsv, lower_blonde, upper_blonde)

        # Combine masks
        mask = mask_dark | mask_brown | mask_blonde

        kernel = cv2.getStructuringElement(cv2.MORPH_ELLIPSE, (5, 5))
        mask = cv2.morphologyEx(mask, cv2.MORPH_OPEN, kernel, iterations=2)
        mask = cv2.morphologyEx(mask, cv2.MORPH_CLOSE, kernel, iterations=2)

        hair_region = cv2.bitwise_and(img_bgr, img_bgr, mask=mask)
        hair_region_rgb = cv2.cvtColor(hair_region, cv2.COLOR_BGR2RGB)
        return hair_region_rgb, mask
    except Exception as e:
        print(f"Error creating hair mask: {e}")
        return None, None

In [8]:
# Full Code for Hair Model: Hair Mask Generation and Hair Length Prediction
# This script loads the UTKFace dataset, generates hair masks using improved HSV ranges,
# predicts hair length using a pre-trained Hugging Face model, saves masks as PNGs,
# and creates a CSV with image paths, age, gender, hair_length, and hair_mask_paths.
# Environment: Kaggle Notebook (adjust paths for Colab if needed).

import numpy as np
import pandas as pd
import tensorflow as tf  # Optional, for seeds
from transformers import pipeline
import cv2
import os
from PIL import Image
import warnings
warnings.filterwarnings('ignore')

# Set random seed for reproducibility
np.random.seed(42)
tf.random.set_seed(42)

# Dataset path (Kaggle-specific; change to '/content/UTKFace' in Colab)
dataset_dir = '/kaggle/input/utkface-cropped/UTKFace'
print(f"Processing from: {dataset_dir}")

# Create output directory for hair masks and CSV (writable in Kaggle)
output_base = '/kaggle/working/UTKFace_processed'
os.makedirs(output_base, exist_ok=True)
hair_masks_dir = os.path.join(output_base, 'hair_masks')
os.makedirs(hair_masks_dir, exist_ok=True)
csv_path = os.path.join(output_base, 'utkface_with_hair_length.csv')
print(f"Hair masks will be saved to: {hair_masks_dir}")
print(f"CSV will be saved to: {csv_path}")

# Load Pre-trained Hair Length Model
print("Loading hair length model...")
try:
    hair_classifier = pipeline("image-classification", model="Leilab/hair_lenght")
    print("Hair length model loaded successfully.")
except Exception as e:
    print(f"Error loading hair length model: {e}")
    raise

# Improved Hair Mask Function (with multiple HSV ranges for diverse hair colors)
def create_hair_mask(image):
    try:
        img_array = np.array(image)
        img_bgr = cv2.cvtColor(img_array, cv2.COLOR_RGB2BGR)
        img_hsv = cv2.cvtColor(img_bgr, cv2.COLOR_BGR2HSV)

        # Multiple ranges: Dark hair + Brown + Blonde/Light
        lower_dark = np.array([0, 0, 0])
        upper_dark = np.array([180, 255, 50])  # Lower value for darker tones
        mask_dark = cv2.inRange(img_hsv, lower_dark, upper_dark)

        lower_brown = np.array([10, 50, 50])  # Brown tones
        upper_brown = np.array([25, 200, 150])
        mask_brown = cv2.inRange(img_hsv, lower_brown, upper_brown)

        lower_blonde = np.array([20, 30, 100])  # Blonde/light
        upper_blonde = np.array([35, 150, 255])
        mask_blonde = cv2.inRange(img_hsv, lower_blonde, upper_blonde)

        # Combine masks
        mask = mask_dark | mask_brown | mask_blonde

        kernel = cv2.getStructuringElement(cv2.MORPH_ELLIPSE, (5, 5))
        mask = cv2.morphologyEx(mask, cv2.MORPH_OPEN, kernel, iterations=2)
        mask = cv2.morphologyEx(mask, cv2.MORPH_CLOSE, kernel, iterations=2)

        hair_region = cv2.bitwise_and(img_bgr, img_bgr, mask=mask)
        hair_region_rgb = cv2.cvtColor(hair_region, cv2.COLOR_BGR2RGB)
        return hair_region_rgb, mask
    except Exception as e:
        print(f"Error creating hair mask: {e}")
        return None, None

# Generate Dataset with Hair Length and Save Masks
print("Starting image processing...")
images = []
ages = []
genders = []
hair_lengths = []
hair_mask_paths = []
max_images = 200  # Increase to 2000 for full run; start small for testing
valid_files = [f for f in os.listdir(dataset_dir) if f.endswith('.jpg')]
print(f"Found {len(valid_files)} .jpg files. Processing first {max_images}.")

processed_count = 0
for filename in valid_files[:max_images]:
    parts = filename.split('_')
    try:
        age = int(parts[0])
        gender = int(parts[1])  # 0: male, 1: female
        img_path = os.path.join(dataset_dir, filename)
        img = Image.open(img_path).resize((224, 224))
        hair_region, mask = create_hair_mask(img)
        if hair_region is not None and mask is not None:
            # Predict hair length on masked region
            hair_pred = hair_classifier(Image.fromarray(hair_region))[0]
            print(f"Raw label for {filename}: {hair_pred['label']} (score: {hair_pred['score']:.2f})")  # Debug raw output
            hair_length = 1 if 'long' in hair_pred['label'].lower() else 0

            # Save hair mask as PNG
            mask_filename = filename.replace('.jpg', '_hair_mask.png')
            mask_path = os.path.join(hair_masks_dir, mask_filename)
            mask_pil = Image.fromarray(mask.astype(np.uint8))
            mask_pil.save(mask_path)

            # Collect data
            images.append(img_path)
            ages.append(age)
            genders.append(gender)
            hair_lengths.append(hair_length)
            hair_mask_paths.append(mask_path)
            processed_count += 1
            if processed_count % 50 == 0:
                print(f"Processed {processed_count}/{min(max_images, len(valid_files))}: {filename}")
        else:
            print(f"Skipped {filename}: Failed to create hair mask.")
    except (IndexError, ValueError, Exception) as e:
        print(f"Skipped {filename}: {e}")
        continue

if not images:
    print("Error: No valid images processed.")
    raise ValueError("No valid images loaded.")

# Create and Save CSV
new_dataset = pd.DataFrame({
    'image_path': images,
    'age': ages,
    'gender': genders,
    'hair_length': hair_lengths,
    'hair_mask_path': hair_mask_paths
})
new_dataset.to_csv(csv_path, index=False)
print(f"\nNew dataset saved to: {csv_path} (Rows: {len(new_dataset)})")
print("Hair length distribution:")
print(new_dataset['hair_length'].value_counts())
print("\nPreview of first 5 rows:")
print(new_dataset.head())

# Optional: Zip outputs for download
!zip -r /kaggle/working/UTKFace_hair_masks.zip {hair_masks_dir} {csv_path}
print("\nZipped outputs saved to /kaggle/working/UTKFace_hair_masks.zip – Download from the Output panel!")

Processing from: /kaggle/input/utkface-cropped/UTKFace
Hair masks will be saved to: /kaggle/working/UTKFace_processed/hair_masks
CSV will be saved to: /kaggle/working/UTKFace_processed/utkface_with_hair_length.csv
Loading hair length model...


Invalid model-index. Not loading eval results into CardData.
Invalid model-index. Not loading eval results into CardData.


Fetching 1 files:   0%|          | 0/1 [00:00<?, ?it/s]

Device set to use cpu


Hair length model loaded successfully.
Starting image processing...
Found 23708 .jpg files. Processing first 200.
Raw label for 26_0_2_20170104023102422.jpg.chip.jpg: short human hair (score: 0.78)
Raw label for 22_1_1_20170112233644761.jpg.chip.jpg: short human hair (score: 0.69)
Raw label for 21_1_3_20170105003215901.jpg.chip.jpg: short human hair (score: 0.74)
Raw label for 28_0_0_20170117180555824.jpg.chip.jpg: short human hair (score: 0.72)
Raw label for 17_1_4_20170103222931966.jpg.chip.jpg: short human hair (score: 0.73)
Raw label for 44_0_3_20170119201022260.jpg.chip.jpg: short human hair (score: 0.64)
Raw label for 35_0_2_20170116182734834.jpg.chip.jpg: short human hair (score: 0.64)
Raw label for 76_0_0_20170104213515132.jpg.chip.jpg: short human hair (score: 0.59)
Raw label for 36_1_0_20170116165722892.jpg.chip.jpg: short human hair (score: 0.68)
Raw label for 34_0_3_20170119200815948.jpg.chip.jpg: short human hair (score: 0.68)
Raw label for 18_1_0_20170104022856102.jpg.chi

In [9]:
# Full Code for Hair Model: Hair Mask Generation and Hair Length Prediction
# This script loads the UTKFace dataset, generates hair masks using improved HSV ranges,
# predicts hair length using a pre-trained Hugging Face model, saves masks as PNGs,
# and creates a CSV with image paths, age, gender, hair_length, and hair_mask_paths.
# Environment: Kaggle Notebook (adjust paths for Colab if needed).

import numpy as np
import pandas as pd
import tensorflow as tf  # Optional, for seeds
from transformers import pipeline
import cv2
import os
from PIL import Image
import warnings
warnings.filterwarnings('ignore')

# Set random seed for reproducibility
np.random.seed(42)
tf.random.set_seed(42)

# Dataset path (Kaggle-specific; change to '/content/UTKFace' in Colab)
dataset_dir = '/kaggle/input/utkface-cropped/UTKFace'
print(f"Processing from: {dataset_dir}")

# Create output directory for hair masks and CSV (writable in Kaggle)
output_base = '/kaggle/working/UTKFace_processed'
os.makedirs(output_base, exist_ok=True)
hair_masks_dir = os.path.join(output_base, 'hair_masks')
os.makedirs(hair_masks_dir, exist_ok=True)
csv_path = os.path.join(output_base, 'utkface_with_hair_length.csv')
print(f"Hair masks will be saved to: {hair_masks_dir}")
print(f"CSV will be saved to: {csv_path}")

# Load Pre-trained Hair Length Model
print("Loading hair length model...")
try:
    hair_classifier = pipeline("image-classification", model="Leilab/hair_lenght")
    print("Hair length model loaded successfully.")
except Exception as e:
    print(f"Error loading hair length model: {e}")
    raise

# Improved Hair Mask Function (with multiple HSV ranges for diverse hair colors)
def create_hair_mask(image):
    try:
        img_array = np.array(image)
        img_bgr = cv2.cvtColor(img_array, cv2.COLOR_RGB2BGR)
        img_hsv = cv2.cvtColor(img_bgr, cv2.COLOR_BGR2HSV)

        # Multiple ranges: Dark hair + Brown + Blonde/Light
        lower_dark = np.array([0, 0, 0])
        upper_dark = np.array([180, 255, 50])  # Lower value for darker tones
        mask_dark = cv2.inRange(img_hsv, lower_dark, upper_dark)

        lower_brown = np.array([10, 50, 50])  # Brown tones
        upper_brown = np.array([25, 200, 150])
        mask_brown = cv2.inRange(img_hsv, lower_brown, upper_brown)

        lower_blonde = np.array([20, 30, 100])  # Blonde/light
        upper_blonde = np.array([35, 150, 255])
        mask_blonde = cv2.inRange(img_hsv, lower_blonde, upper_blonde)

        # Combine masks
        mask = mask_dark | mask_brown | mask_blonde

        kernel = cv2.getStructuringElement(cv2.MORPH_ELLIPSE, (5, 5))
        mask = cv2.morphologyEx(mask, cv2.MORPH_OPEN, kernel, iterations=2)
        mask = cv2.morphologyEx(mask, cv2.MORPH_CLOSE, kernel, iterations=2)

        hair_region = cv2.bitwise_and(img_bgr, img_bgr, mask=mask)
        hair_region_rgb = cv2.cvtColor(hair_region, cv2.COLOR_BGR2RGB)
        return hair_region_rgb, mask
    except Exception as e:
        print(f"Error creating hair mask: {e}")
        return None, None

# Generate Dataset with Hair Length and Save Masks
print("Starting image processing...")
images = []
ages = []
genders = []
hair_lengths = []
hair_mask_paths = []
max_images = 200  # Increase to 2000 for full run; start small for testing
valid_files = [f for f in os.listdir(dataset_dir) if f.endswith('.jpg')]
print(f"Found {len(valid_files)} .jpg files. Processing first {max_images}.")

processed_count = 0
for filename in valid_files[:max_images]:
    parts = filename.split('_')
    try:
        age = int(parts[0])
        gender = int(parts[1])  # 0: male, 1: female
        img_path = os.path.join(dataset_dir, filename)
        img = Image.open(img_path).resize((224, 224))
        hair_region, mask = create_hair_mask(img)
        if hair_region is not None and mask is not None:
            # Predict hair length on masked region
            hair_pred = hair_classifier(Image.fromarray(hair_region))[0]
            print(f"Raw label for {filename}: {hair_pred['label']} (score: {hair_pred['score']:.2f})")  # Debug raw output
            hair_length = 1 if 'long' in hair_pred['label'].lower() else 0

            # Save hair mask as PNG
            mask_filename = filename.replace('.jpg', '_hair_mask.png')
            mask_path = os.path.join(hair_masks_dir, mask_filename)
            mask_pil = Image.fromarray(mask.astype(np.uint8))
            mask_pil.save(mask_path)

            # Collect data
            images.append(img_path)
            ages.append(age)
            genders.append(gender)
            hair_lengths.append(hair_length)
            hair_mask_paths.append(mask_path)
            processed_count += 1
            if processed_count % 50 == 0:
                print(f"Processed {processed_count}/{min(max_images, len(valid_files))}: {filename}")
        else:
            print(f"Skipped {filename}: Failed to create hair mask.")
    except (IndexError, ValueError, Exception) as e:
        print(f"Skipped {filename}: {e}")
        continue

if not images:
    print("Error: No valid images processed.")
    raise ValueError("No valid images loaded.")

# Create and Save CSV
new_dataset = pd.DataFrame({
    'image_path': images,
    'age': ages,
    'gender': genders,
    'hair_length': hair_lengths,
    'hair_mask_path': hair_mask_paths
})
new_dataset.to_csv(csv_path, index=False)
print(f"\nNew dataset saved to: {csv_path} (Rows: {len(new_dataset)})")
print("Hair length distribution:")
print(new_dataset['hair_length'].value_counts())
print("\nPreview of first 5 rows:")
print(new_dataset.head())

# Optional: Zip outputs for download
!zip -r /kaggle/working/UTKFace_hair_masks.zip {hair_masks_dir} {csv_path}
print("\nZipped outputs saved to /kaggle/working/UTKFace_hair_masks.zip – Download from the Output panel!")

Processing from: /kaggle/input/utkface-cropped/UTKFace
Hair masks will be saved to: /kaggle/working/UTKFace_processed/hair_masks
CSV will be saved to: /kaggle/working/UTKFace_processed/utkface_with_hair_length.csv
Loading hair length model...


Invalid model-index. Not loading eval results into CardData.
Invalid model-index. Not loading eval results into CardData.


Fetching 1 files:   0%|          | 0/1 [00:00<?, ?it/s]

Device set to use cpu


Hair length model loaded successfully.
Starting image processing...
Found 23708 .jpg files. Processing first 200.
Raw label for 26_0_2_20170104023102422.jpg.chip.jpg: short human hair (score: 0.78)
Raw label for 22_1_1_20170112233644761.jpg.chip.jpg: short human hair (score: 0.69)
Raw label for 21_1_3_20170105003215901.jpg.chip.jpg: short human hair (score: 0.74)
Raw label for 28_0_0_20170117180555824.jpg.chip.jpg: short human hair (score: 0.72)
Raw label for 17_1_4_20170103222931966.jpg.chip.jpg: short human hair (score: 0.73)
Raw label for 44_0_3_20170119201022260.jpg.chip.jpg: short human hair (score: 0.64)
Raw label for 35_0_2_20170116182734834.jpg.chip.jpg: short human hair (score: 0.64)
Raw label for 76_0_0_20170104213515132.jpg.chip.jpg: short human hair (score: 0.59)
Raw label for 36_1_0_20170116165722892.jpg.chip.jpg: short human hair (score: 0.68)
Raw label for 34_0_3_20170119200815948.jpg.chip.jpg: short human hair (score: 0.68)
Raw label for 18_1_0_20170104022856102.jpg.chi

In [10]:
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Conv2D, MaxPooling2D, Flatten, Dense, Dropout
import cv2
import os
import random
from PIL import Image
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings('ignore')

# Set random seed for reproducibility
np.random.seed(42)
tf.random.set_seed(42)
random.seed(42)

# Dataset and output paths
dataset_dir = '/kaggle/input/utkface-cropped/UTKFace'
output_base = '/kaggle/working/UTKFace_processed'
hair_masks_dir = os.path.join(output_base, 'hair_masks')
model_path = os.path.join(output_base, 'hair_length_model.h5')
csv_path = os.path.join(output_base, 'utkface_with_hair_length.csv')
os.makedirs(output_base, exist_ok=True)
os.makedirs(hair_masks_dir, exist_ok=True)
print(f"Dataset: {dataset_dir}")
print(f"Hair masks: {hair_masks_dir}")
print(f"Model will be saved to: {model_path}")
print(f"CSV will be saved to: {csv_path}")

# Load Haar Cascade for face detection
face_cascade = cv2.CascadeClassifier(cv2.data.haarcascades + 'haarcascade_frontalface_default.xml')

# Improved Hair Mask Function with Face Detection
def create_hair_mask(image):
    try:
        img_array = np.array(image)
        img_bgr = cv2.cvtColor(img_array, cv2.COLOR_RGB2BGR)
        img_hsv = cv2.cvtColor(img_bgr, cv2.COLOR_BGR2HSV)

        # Multiple HSV ranges for diverse hair colors
        lower_dark = np.array([0, 0, 0])
        upper_dark = np.array([180, 255, 50])  # Dark hair
        mask_dark = cv2.inRange(img_hsv, lower_dark, upper_dark)

        lower_brown = np.array([10, 50, 50])  # Brown tones
        upper_brown = np.array([25, 200, 200])
        mask_brown = cv2.inRange(img_hsv, lower_brown, upper_brown)

        lower_blonde = np.array([20, 30, 100])  # Blonde/light
        upper_blonde = np.array([35, 150, 255])
        mask_blonde = cv2.inRange(img_hsv, lower_blonde, upper_blonde)

        # Combine masks
        mask = mask_dark | mask_brown | mask_blonde

        # Face detection to focus on head region
        gray = cv2.cvtColor(img_bgr, cv2.COLOR_BGR2GRAY)
        faces = face_cascade.detectMultiScale(gray, scaleFactor=1.1, minNeighbors=5, minSize=(30, 30))
        if len(faces) > 0:
            x, y, w, h = faces[0]
            y_top = max(0, y - int(h * 0.5))
            y_bottom = min(img_bgr.shape[0], y + h)
            x_left = max(0, x - int(w * 0.2))
            x_right = min(img_bgr.shape[1], x + w + int(w * 0.2))
            mask[0:y_top, :] = 0
            mask[y_bottom:, :] = 0
            mask[:, 0:x_left] = 0
            mask[:, x_right:] = 0

        # Morphological operations
        kernel = cv2.getStructuringElement(cv2.MORPH_ELLIPSE, (5, 5))
        mask = cv2.morphologyEx(mask, cv2.MORPH_OPEN, kernel, iterations=2)
        mask = cv2.morphologyEx(mask, cv2.MORPH_CLOSE, kernel, iterations=2)

        # Create hair region (cropped to top 70%)
        hair_region = cv2.bitwise_and(img_bgr, img_bgr, mask=mask)
        hair_region_rgb = cv2.cvtColor(hair_region, cv2.COLOR_BGR2RGB)
        height = hair_region_rgb.shape[0]
        hair_region_cropped = hair_region_rgb[:int(0.7 * height), :]

        # Pseudo-label: Long hair if mask extends >30% of image height
        mask_height = np.sum(mask, axis=1)
        non_zero_rows = np.where(mask_height > 0)[0]
        hair_extent = (non_zero_rows[-1] - non_zero_rows[0]) / mask.shape[0] if len(non_zero_rows) > 0 else 0
        hair_length = 1 if hair_extent > 0.3 else 0

        return hair_region_cropped, mask, hair_length
    except Exception as e:
        print(f"Error creating hair mask: {e}")
        return None, None, None

# Visualize a sample for debugging
def visualize_sample(image_path, hair_region, mask, filename, hair_length):
    original_img = Image.open(image_path)
    mask_pil = Image.fromarray(mask)
    hair_pil = Image.fromarray(hair_region)
    fig, axes = plt.subplots(1, 3, figsize=(15, 5))
    axes[0].imshow(original_img)
    axes[0].set_title('Original Image')
    axes[1].imshow(mask_pil, cmap='gray')
    axes[1].set_title(f'Hair Mask (Label: {"Long" if hair_length else "Short"})')
    axes[2].imshow(hair_pil)
    axes[2].set_title('Hair Region (Cropped)')
    plt.savefig(os.path.join(hair_masks_dir, f"{filename}_debug.png"))
    plt.close()
    print(f"Debug visualization saved for {filename} (Hair Length: {'Long' if hair_length else 'Short'})")

# Build CNN Model
def build_cnn_model():
    model = Sequential([
        Conv2D(32, (3, 3), activation='relu', input_shape=(224, 224, 3)),
        MaxPooling2D((2, 2)),
        Conv2D(64, (3, 3), activation='relu'),
        MaxPooling2D((2, 2)),
        Conv2D(128, (3, 3), activation='relu'),
        MaxPooling2D((2, 2)),
        Flatten(),
        Dense(128, activation='relu'),
        Dropout(0.5),
        Dense(1, activation='sigmoid')  # Binary output (short=0, long=1)
    ])
    model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
    return model

# Prepare Dataset
print("Preparing dataset...")
images = []
ages = []
genders = []
hair_lengths = []
hair_mask_paths = []
image_arrays = []
max_images = 200
valid_files = [f for f in os.listdir(dataset_dir) if f.endswith('.jpg')]
valid_files = random.sample(valid_files, min(max_images, len(valid_files)))
print(f"Found {len(valid_files)} .jpg files. Processing {max_images} images.")

processed_count = 0
for filename in valid_files:
    parts = filename.split('_')
    try:
        age = int(parts[0])
        gender = int(parts[1])
        img_path = os.path.join(dataset_dir, filename)
        img = Image.open(img_path).resize((224, 224))
        hair_region, mask, hair_length = create_hair_mask(img)
        if hair_region is not None and mask is not None:
            # Save hair mask
            mask_filename = filename.replace('.jpg', '_hair_mask.png')
            mask_path = os.path.join(hair_masks_dir, mask_filename)
            mask_pil = Image.fromarray(mask.astype(np.uint8))
            mask_pil.save(mask_path)

            # Debug visualization for first 5 images
            if processed_count < 5:
                visualize_sample(img_path, hair_region, mask, filename, hair_length)

            # Collect data
            images.append(img_path)
            ages.append(age)
            genders.append(gender)
            hair_lengths.append(hair_length)
            hair_mask_paths.append(mask_path)
            image_arrays.append(np.array(img))  # Full image for training
            processed_count += 1
            if processed_count % 50 == 0:
                print(f"Processed {processed_count}/{max_images}: {filename}")
        else:
            print(f"Skipped {filename}: Failed to create hair mask.")
    except (IndexError, ValueError, Exception) as e:
        print(f"Skipped {filename}: {e}")
        continue

if not images:
    print("Error: No valid images processed.")
    raise ValueError("No valid images loaded.")

# Convert to arrays for training
X = np.array(image_arrays) / 255.0  # Normalize images
y = np.array(hair_lengths)
print(f"\nDataset prepared: {len(X)} images")
print("Hair length distribution:")
print(pd.Series(y).value_counts())

# Train-Test Split
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)
print(f"Training set: {len(X_train)} images")
print(f"Test set: {len(X_test)} images")

# Build and Train Model
print("\nBuilding and training model...")
model = build_cnn_model()
model.summary()
history = model.fit(X_train, y_train, epochs=10, batch_size=32, validation_data=(X_test, y_test), verbose=1)

# Evaluate Model
test_loss, test_accuracy = model.evaluate(X_test, y_test)
print(f"\nTest Accuracy: {test_accuracy:.4f}, Test Loss: {test_loss:.4f}")

# Save Model
model.save(model_path)
print(f"Model saved to: {model_path}")

# Save Predictions and Dataset
predictions = (model.predict(X) > 0.5).astype(int).flatten()
new_dataset = pd.DataFrame({
    'image_path': images,
    'age': ages,
    'gender': genders,
    'hair_length': predictions,
    'hair_mask_path': hair_mask_paths
})
new_dataset.to_csv(csv_path, index=False)
print(f"\nDataset saved to: {csv_path} (Rows: {len(new_dataset)})")
print("Predicted hair length distribution:")
print(new_dataset['hair_length'].value_counts())

# Plot Training History
plt.figure(figsize=(12, 4))
plt.subplot(1, 2, 1)
plt.plot(history.history['accuracy'], label='Train Accuracy')
plt.plot(history.history['val_accuracy'], label='Val Accuracy')
plt.title('Model Accuracy')
plt.xlabel('Epoch')
plt.ylabel('Accuracy')
plt.legend()
plt.subplot(1, 2, 2)
plt.plot(history.history['loss'], label='Train Loss')
plt.plot(history.history['val_loss'], label='Val Loss')
plt.title('Model Loss')
plt.xlabel('Epoch')
plt.ylabel('Loss')
plt.legend()
plt.savefig(os.path.join(output_base, 'training_history.png'))
plt.close()
print(f"Training history plot saved to: {os.path.join(output_base, 'training_history.png')}")

# Zip Outputs
!zip -r /kaggle/working/UTKFace_hair_model_outputs.zip {output_base}
print("\nZipped outputs (model, masks, CSV, plots) saved to /kaggle/working/UTKFace_hair_model_outputs.zip")
print("Download from Kaggle's Output panel!")

Dataset: /kaggle/input/utkface-cropped/UTKFace
Hair masks: /kaggle/working/UTKFace_processed/hair_masks
Model will be saved to: /kaggle/working/UTKFace_processed/hair_length_model.h5
CSV will be saved to: /kaggle/working/UTKFace_processed/utkface_with_hair_length.csv
Preparing dataset...
Found 200 .jpg files. Processing 200 images.
Debug visualization saved for 42_0_0_20170117154729044.jpg.chip.jpg (Hair Length: Short)
Debug visualization saved for 55_0_0_20170117170523803.jpg.chip.jpg (Hair Length: Long)
Debug visualization saved for 52_0_0_20170104204156747.jpg.chip.jpg (Hair Length: Long)
Debug visualization saved for 38_0_1_20170113190540449.jpg.chip.jpg (Hair Length: Long)
Debug visualization saved for 23_0_1_20170113150913751.jpg.chip.jpg (Hair Length: Long)
Processed 50/200: 46_1_0_20170109013450138.jpg.chip.jpg
Processed 100/200: 35_1_3_20170119152835120.jpg.chip.jpg
Processed 150/200: 32_1_1_20170116161336538.jpg.chip.jpg
Processed 200/200: 32_1_3_20170109134532682.jpg.chip.jp

Epoch 1/10
[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m24s[0m 4s/step - accuracy: 0.6991 - loss: 0.8581 - val_accuracy: 0.9500 - val_loss: 0.2890
Epoch 2/10
[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m42s[0m 5s/step - accuracy: 0.9578 - loss: 0.2485 - val_accuracy: 0.9500 - val_loss: 0.2152
Epoch 3/10
[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m36s[0m 3s/step - accuracy: 0.9578 - loss: 0.2317 - val_accuracy: 0.9500 - val_loss: 0.2052
Epoch 4/10
[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m20s[0m 3s/step - accuracy: 0.9578 - loss: 0.1794 - val_accuracy: 0.9500 - val_loss: 0.1935
Epoch 5/10
[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m16s[0m 3s/step - accuracy: 0.9578 - loss: 0.1991 - val_accuracy: 0.9500 - val_loss: 0.1740
Epoch 6/10
[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m21s[0m 3s/step - accuracy: 0.9578 - loss: 0.1702 - val_accuracy: 0.9500 - val_loss: 0.1553
Epoch 7/10
[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m




Test Accuracy: 0.9500, Test Loss: 0.1486
Model saved to: /kaggle/working/UTKFace_processed/hair_length_model.h5
[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 700ms/step

Dataset saved to: /kaggle/working/UTKFace_processed/utkface_with_hair_length.csv (Rows: 200)
Predicted hair length distribution:
hair_length
1    200
Name: count, dtype: int64
Training history plot saved to: /kaggle/working/UTKFace_processed/training_history.png
  adding: kaggle/working/UTKFace_processed/ (stored 0%)
  adding: kaggle/working/UTKFace_processed/hair_length_model.h5 (deflated 21%)
  adding: kaggle/working/UTKFace_processed/training_history.png (deflated 11%)
  adding: kaggle/working/UTKFace_processed/utkface_with_hair_length.csv (deflated 91%)
  adding: kaggle/working/UTKFace_processed/hair_masks/ (stored 0%)
  adding: kaggle/working/UTKFace_processed/hair_masks/58_0_3_20170119211659305_hair_mask.png.chip_hair_mask.png (stored 0%)
  adding: kaggle/working/UTKFace_processed/hair_masks/28_0

In [11]:
from google.colab import files
files.download("/kaggle/working/UTKFace_processed/hair_length_model.h5")

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>