In [None]:
# In a notebook cell
!pip install -r requirements.txt

In [1]:
from datasets import load_dataset

# Load the dataset from Hugging Face
dataset = load_dataset("akahana/Driver-Drowsiness-Dataset")  # ❗ download combined train/test splits
print(dataset)  # Display splits and columns¹
dataset['train'][0]['image'].show()  # Preview first image from train split
print(f"Train samples: {len(dataset['train'])}")  # Report train size⁸
print(f"Test samples: {len(dataset['test'])}")   # Report test size⁸

# Check label distribution
from collections import Counter
train_labels = [item['label'] for item in dataset['train']]
label_counts = Counter(train_labels)
print(f"Label distribution: {label_counts}")  # Expect something like {0:..., 1:...}⁸


  from .autonotebook import tqdm as notebook_tqdm


DatasetDict({
    train: Dataset({
        features: ['image', 'label'],
        num_rows: 33434
    })
    test: Dataset({
        features: ['image', 'label'],
        num_rows: 8359
    })
})
Train samples: 33434
Test samples: 8359
Label distribution: Counter({0: 17868, 1: 15566})


In [None]:
import os
from pathlib import Path
from sklearn.model_selection import train_test_split
from collections import Counter

def prepare_createml_dataset(dataset, output_dir="createml_data"):
    """
    Create CreateML dataset structure and save images for both train and test.
    Expects 2 classes: 0 (awake) and 1 (drowsy)
    Creates structure: output_dir/train/awake, output_dir/train/drowsy, 
                      output_dir/test/awake, output_dir/test/drowsy
    """
    print("🚀 Starting CreateML dataset preparation...")
    os.makedirs(output_dir, exist_ok=True)
    
    # Process both train and test splits
    splits = {
        'train': dataset['train'],
        'test': dataset['test']
    }
    
    # Create folder structure for both train and test
    for split_name in ['train', 'test']:
        for label_value in [0, 1]:  # 0=awake, 1=drowsy
            label_name = 'awake' if label_value == 0 else 'drowsy'
            dir_path = Path(output_dir) / split_name / label_name
            dir_path.mkdir(parents=True, exist_ok=True)
            print(f"📁 Created directory: {dir_path}")
    
    total_saved = 0
    
    # Process each split (train and test)
    for split_name, split_data in splits.items():
        split_images = [item['image'] for item in split_data]
        split_labels = [item['label'] for item in split_data]
        
        # Save images for this split
        for idx, (image, label) in enumerate(zip(split_images, split_labels)):
            label_name = 'awake' if label == 0 else 'drowsy'
            filename = f"{split_name}_{label_name}_{idx:05d}.jpg"
            filepath = Path(output_dir) / split_name / label_name / filename
            image.save(str(filepath), 'JPEG', quality=95)
            total_saved += 1
            if total_saved % 1000 == 0:
                print(f"💾 Saved {total_saved} images...")
    
    print(f"✅ Dataset preparation complete!")
    print(f"📊 Total images saved: {total_saved}")
    print(f"📂 Dataset location: {output_dir}")
    
    # Final statistics for both splits
    for split_name, split_data in splits.items():
        split_labels = [item['label'] for item in split_data]
        label_counts = Counter(split_labels)
        awake_count = label_counts[0]
        drowsy_count = label_counts[1]
        print(f"{split_name.capitalize()}: {len(split_labels)} images "
              f"(Awake: {awake_count}, Drowsy: {drowsy_count})")
    
    return output_dir

# Usage
dataset_path = prepare_createml_dataset(dataset)


🚀 Starting CreateML dataset preparation...
📁 Created directory: createml_data/train/awake
📁 Created directory: createml_data/train/drowsy
💾 Saved 1000 images...
💾 Saved 2000 images...
💾 Saved 3000 images...
💾 Saved 4000 images...
💾 Saved 5000 images...
💾 Saved 6000 images...
💾 Saved 7000 images...
💾 Saved 8000 images...
💾 Saved 9000 images...
💾 Saved 10000 images...
💾 Saved 11000 images...
💾 Saved 12000 images...
💾 Saved 13000 images...
💾 Saved 14000 images...
💾 Saved 15000 images...
💾 Saved 16000 images...
💾 Saved 17000 images...
💾 Saved 18000 images...
💾 Saved 19000 images...
💾 Saved 20000 images...
💾 Saved 21000 images...
💾 Saved 22000 images...
💾 Saved 23000 images...
💾 Saved 24000 images...
💾 Saved 25000 images...
💾 Saved 26000 images...
💾 Saved 27000 images...
💾 Saved 28000 images...
💾 Saved 29000 images...
💾 Saved 30000 images...
💾 Saved 31000 images...
💾 Saved 32000 images...
💾 Saved 33000 images...
✅ Dataset preparation complete!
📊 Total images saved: 33434
📂 Dataset location:

In [3]:
import cv2
import mediapipe as mp
import gc
from datasets import load_dataset

# Initialize MediaPipe FaceMesh ONCE globally - not inside the function!
mp_face_mesh = mp.solutions.face_mesh
face_mesh = mp_face_mesh.FaceMesh(
    static_image_mode=True,
    max_num_faces=1,
    refine_landmarks=True,
    min_detection_confidence=0.5
)

def extract_landmarks(image_path):
    """
    Extract 468 facial landmarks from a single image using MediaPipe Face Mesh.
    Returns a list of dicts [{'x':..., 'y':..., 'z':...}, ...] or None if no face detected.
    """
    try:
        image = cv2.imread(image_path)
        if image is None:
            print(f"⚠️ Could not read image: {image_path}")
            return None
        
        rgb_image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
        results = face_mesh.process(rgb_image)  # Use the global face_mesh instance
        
        if results.multi_face_landmarks:
            landmarks = []
            for lm in results.multi_face_landmarks[0].landmark:
                landmarks.append({'x': lm.x, 'y': lm.y, 'z': lm.z})
            
            # Clean up memory after processing each image
            del image, rgb_image, results
            gc.collect()
            
            return landmarks
        
        # Clean up even when no face detected
        del image, rgb_image, results
        gc.collect()
        return None
        
    except Exception as e:
        print(f"❌ Error processing {image_path}: {e}")
        gc.collect()  # Clean up on error too
        return None

I0000 00:00:1749084788.675120 3310869 gl_context.cc:369] GL version: 2.1 (2.1 Metal - 89.4), renderer: Apple M4 Pro


In [4]:
import json
import pandas as pd
import numpy as np
import os
import concurrent.futures
import time
from typing import List, Dict

class LandmarkProcessor:
    """
    Wraps extract_landmarks to process a batch of image paths.
    """
    def __init__(self, extract_func):
        self.extract_func = extract_func
    
    def process_batch(self, image_paths_batch: List[str]) -> List[Dict]:
        """
        Processes a batch of images, returning a list of dicts:
        {'image_path': str, 'label': 'awake'/'drowsy', 'landmarks': [ {x,y,z}, ... ]}
        """
        batch_results = []
        for image_path in image_paths_batch:
            try:
                landmarks = self.extract_func(image_path)
                if landmarks:
                    # Determine label from path: folder name 'awake' or 'drowsy'
                    label = 'drowsy' if 'drowsy' in image_path.lower() else 'awake'
                    batch_results.append({
                        'image_path': image_path,
                        'landmarks': landmarks,
                        'label': label
                    })
            except Exception as e:
                print(f"⚠️ Error processing {image_path}: {e}")  # Log exceptions per image¹⁴
        return batch_results

def process_with_threading(image_paths, extract_landmarks_func, batch_size=500, num_threads=4, phase_name=""):
    """
    Processes all image_paths by dividing into batches of size batch_size.
    Uses ThreadPoolExecutor with num_threads parallel workers.
    Returns (all_landmarks_list, DataFrame).
    """
    print(f"🧵 Starting {phase_name} thread-based processing:")  
    print(f"   📊 Images to process: {len(image_paths)}")  
    print(f"   📦 Batch size: {batch_size}, Threads: {num_threads}")  # Info log
    
    # Create batches as lists of file paths
    batches = [image_paths[i:i + batch_size] for i in range(0, len(image_paths), batch_size)]
    print(f"📦 Created {len(batches)} batches")
    
    start_time = time.time()
    all_landmarks = []
    
    # Launch ThreadPoolExecutor
    with concurrent.futures.ThreadPoolExecutor(max_workers=num_threads) as executor:
        processor = LandmarkProcessor(extract_landmarks_func)
        future_to_idx = {
            executor.submit(processor.process_batch, batch): idx
            for idx, batch in enumerate(batches)
        }
        
        for future in concurrent.futures.as_completed(future_to_idx):
            batch_idx = future_to_idx[future]
            try:
                batch_result = future.result()
                all_landmarks.extend(batch_result)
                elapsed = time.time() - start_time
                rate = len(all_landmarks) / elapsed if elapsed > 0 else 0
                print(f"✅ {phase_name} Batch {batch_idx+1}/{len(batches)} done "
                      f"({len(batch_result)} images, {rate:.1f} images/sec)")  # Log per-batch speed¹⁵
            except Exception as e:
                print(f"❌ {phase_name} Batch {batch_idx+1} failed: {e}")
    
    total_time = time.time() - start_time
    print(f"🧵 {phase_name} thread-based extraction complete!")
    print(f"   📊 Processed: {len(all_landmarks)} images")
    print(f"   ⏱️ Time: {total_time:.2f} seconds, Speed: {len(all_landmarks)/total_time:.2f} images/sec")  # Final stats
    
    # Convert to DataFrame
    def convert_single(item):
        row = {'image_path': item['image_path'], 'label': item['label']}
        for i, lm in enumerate(item['landmarks']):
            row[f'landmark_{i}_x'] = lm['x']
            row[f'landmark_{i}_y'] = lm['y']
            row[f'landmark_{i}_z'] = lm['z']
        # Compute EAR/MAR if full landmarks
        if len(item['landmarks']) >= 468:
            # Left eye points indices, Right eye indices
            left_idx = [362, 385, 387, 263, 373, 380]
            right_idx = [33, 160, 158, 133, 153, 144]
            def ear(points):
                v1 = np.linalg.norm(np.array([
                    item['landmarks'][points[1]]['x'] - item['landmarks'][points[5]]['x'],
                    item['landmarks'][points[1]]['y'] - item['landmarks'][points[5]]['y']
                ]))
                v2 = np.linalg.norm(np.array([
                    item['landmarks'][points[2]]['x'] - item['landmarks'][points[4]]['x'],
                    item['landmarks'][points[2]]['y'] - item['landmarks'][points[4]]['y']
                ]))
                h = np.linalg.norm(np.array([
                    item['landmarks'][points[0]]['x'] - item['landmarks'][points[3]]['x'],
                    item['landmarks'][points[0]]['y'] - item['landmarks'][points[3]]['y']
                ]))
                return (v1 + v2) / (2.0 * h) if h > 0 else 0.0
            left_ear = ear(left_idx)
            right_ear = ear(right_idx)
            row['left_eye_ear'] = left_ear
            row['right_eye_ear'] = right_ear
            row['avg_eye_ear'] = (left_ear + right_ear) / 2.0
            
            # Mouth AR (using first 6 mouth indices)
            mp_idx = [61, 84, 17, 314, 405, 320]  # 6-point indices
            v1 = np.linalg.norm(np.array([
                item['landmarks'][mp_idx[1]]['x'] - item['landmarks'][mp_idx[5]]['x'],
                item['landmarks'][mp_idx[1]]['y'] - item['landmarks'][mp_idx[5]]['y']
            ]))
            v2 = np.linalg.norm(np.array([
                item['landmarks'][mp_idx[2]]['x'] - item['landmarks'][mp_idx[4]]['x'],
                item['landmarks'][mp_idx[2]]['y'] - item['landmarks'][mp_idx[4]]['y']
            ]))
            h_m = np.linalg.norm(np.array([
                item['landmarks'][mp_idx[0]]['x'] - item['landmarks'][mp_idx[3]]['x'],
                item['landmarks'][mp_idx[0]]['y'] - item['landmarks'][mp_idx[3]]['y']
            ]))
            row['mouth_aspect_ratio'] = (v1 + v2) / (2.0 * h_m) if h_m > 0 else 0.0
        return row
    
    csv_rows = [convert_single(item) for item in all_landmarks]
    landmark_df = pd.DataFrame(csv_rows)
    print(f"🔄 Converted landmarks to CSV: {landmark_df.shape} rows/columns")  # Conversion log¹⁸
    
    return all_landmarks, landmark_df


INFO: Created TensorFlow Lite XNNPACK delegate for CPU.
W0000 00:00:1749084788.683713 3326626 inference_feedback_manager.cc:114] Feedback manager requires a model with a single signature inference. Disabling support for feedback tensors.


W0000 00:00:1749084788.690886 3326626 inference_feedback_manager.cc:114] Feedback manager requires a model with a single signature inference. Disabling support for feedback tensors.


In [5]:
def create_train_test_splits(csv_df, train_ratio=0.8, test_ratio=0.2, random_state=42):
    """
    Splits a DataFrame with a 'label' column into stratified train and test sets only
    """
    print(f"📊 Creating train/test splits: "
          f"Train {train_ratio*100:.1f}%, Test {test_ratio*100:.1f}%")
    
    if abs(train_ratio + test_ratio - 1.0) > 1e-3:
        raise ValueError("Ratios must sum to 1.0")
    
    # Single split: train vs test
    train_df, test_df = train_test_split(
        csv_df,
        test_size=test_ratio,
        random_state=random_state,
        stratify=csv_df['label']
    )
    
    print(f"✅ Splits: Train {len(train_df)} ({len(train_df)/len(csv_df)*100:.1f}%), "
          f"Test {len(test_df)} ({len(test_df)/len(csv_df)*100:.1f}%)")
    
    # Show distribution
    for name, df_split in [("Train", train_df), ("Test", test_df)]:
        print(f"   {name} label dist: {df_split['label'].value_counts().to_dict()}")
    
    return train_df, test_df

def save_splits_to_files(train_df, test_df, prefix="landmarks"):
    """
    Saves DataFrames to CSV files named _train.csv and _test.csv
    """
    filenames = {}
    for split_name, df in [("train", train_df), ("test", test_df)]:
        fname = f"{prefix}_{split_name}.csv"
        df.to_csv(fname, index=False)
        filenames[split_name] = fname
        print(f"💾 {split_name.capitalize()} saved: {fname} ({len(df)} rows)")
    return filenames

In [6]:
def run_two_phase_processing(createml_dir, extract_landmarks_func,
                             small_batch_size=10, full_batch_size=500,
                             num_threads=4, create_splits=True,
                             train_ratio=0.7, test_ratio=0.2, val_ratio=0.1):
    """
    Wrapper that:
      1. Processes a small batch (first 10 images) to validate pipeline.
      2. Waits for user confirmation.
      3. Processes the full dataset with multithreading.
      4. Saves JSON/CSV for small batch and full results.
      5. Optionally creates train/test/val CSV splits for full results.
    """
    print("🚀 Starting two-phase landmark processing with splits...")
    # Gather all image paths under createml_dir recursively
    all_image_paths = []
    for root, dirs, files in os.walk(createml_dir):
        for f in files:
            if f.lower().endswith(('.jpg', '.jpeg', '.png')):
                all_image_paths.append(os.path.join(root, f))
    print(f"📁 Found {len(all_image_paths)} images to process")
    
    # Phase 1: small batch
    small_paths = all_image_paths[:small_batch_size]
    small_landmarks, small_df = process_with_threading(
        small_paths, extract_landmarks_func,
        batch_size=min(5, small_batch_size),
        num_threads=min(2, num_threads),
        phase_name="SMALL"
    )
    if small_landmarks:
        with open("landmarks_small.json", 'w') as f:
            json.dump(small_landmarks, f, indent=2)
        small_df.to_csv("landmarks_small.csv", index=False)
        print(f"✅ Small batch JSON/CSV saved ({len(small_landmarks)} images)")
    else:
        print("❌ No landmarks from small batch, exiting.")
        return None
    
    cont = input("Proceed with full dataset processing? (y/n): ").strip().lower()
    if cont != 'y':
        print("⏹️ Aborting after small batch.")
        return None
    
    # Phase 2: full dataset
    full_landmarks, full_df = process_with_threading(
        all_image_paths, extract_landmarks_func,
        batch_size=full_batch_size,
        num_threads=num_threads,
        phase_name="FULL"
    )
    if full_landmarks:
        with open("landmarks_full.json", 'w') as f:
            json.dump(full_landmarks, f, indent=2)
        if create_splits and len(full_df) > 10:
            try:
                train_df, test_df, val_df = create_train_test_val_splits(
                    full_df, train_ratio, test_ratio, val_ratio
                )
                split_files = save_splits_to_files(train_df, test_df, val_df, "landmarks")
                print("📂 Created splits:")
                for k, v in split_files.items():
                    print(f"   {k}: {v}")
            except Exception as e:
                print(f"⚠️ Splitting error: {e}. Saving full CSV instead.")
                full_df.to_csv("landmarks_full.csv", index=False)
        else:
            full_df.to_csv("landmarks_full.csv", index=False)
            print(f"💾 Full dataset CSV saved: landmarks_full.csv ({len(full_df)})")
        print(f"✅ Full processing complete: {len(full_landmarks)} images extracted")
    else:
        print("❌ No landmarks extracted from full dataset.")


In [7]:
if __name__ == "__main__":
    # Phase 1: Load Hugging Face dataset and prepare CreateML folders
    dataset = load_dataset("akahana/Driver-Drowsiness-Dataset")
    print(f"Train size: {len(dataset['train'])}, Test size: {len(dataset['test'])}")
    
    # Start with smaller subset for testing
    print("🧪 Testing with subset first...")
    small_dataset = {
        'train': dataset['train'].select(range(min(100, len(dataset['train'])))),
        'test': dataset['test'].select(range(min(50, len(dataset['test']))))
    }
    
    # No validation split parameter needed
    prepare_createml_dataset(small_dataset, output_dir="createml_data")
    
    # Phase 2: Run two-phase landmark extraction with train/test splits only
    run_two_phase_processing("createml_data", extract_landmarks,
                             small_batch_size=3,
                             full_batch_size=20,
                             num_threads=1,
                             create_splits=True,
                             train_ratio=0.8,  # Increased since no validation
                             test_ratio=0.2)   # Only train/test now
    
    # Clean up the global MediaPipe instance when done
    face_mesh.close()
    print("✅ Processing complete!")

Train size: 33434, Test size: 8359
🧪 Testing with subset first...
🚀 Starting CreateML dataset preparation...
📁 Created directory: createml_data/train/awake
📁 Created directory: createml_data/train/drowsy
✅ Dataset preparation complete!
📊 Total images saved: 100
📂 Dataset location: createml_data
Train: 100 images (Awake: 58, Drowsy: 42)
🚀 Starting two-phase landmark processing with splits...


W0000 00:00:1749084794.888746 3326625 landmark_projection_calculator.cc:186] Using NORM_RECT without IMAGE_DIMENSIONS is only supported for the square ROI. Provide IMAGE_DIMENSIONS or use PROJECTION_MATRIX.


📁 Found 33434 images to process
🧵 Starting SMALL thread-based processing:
   📊 Images to process: 3
   📦 Batch size: 3, Threads: 1
📦 Created 1 batches
✅ SMALL Batch 1/1 done (3 images, 23.8 images/sec)
🧵 SMALL thread-based extraction complete!
   📊 Processed: 3 images
   ⏱️ Time: 0.13 seconds, Speed: 23.78 images/sec
🔄 Converted landmarks to CSV: (3, 1440) rows/columns
✅ Small batch JSON/CSV saved (3 images)
🧵 Starting FULL thread-based processing:
   📊 Images to process: 33434
   📦 Batch size: 20, Threads: 1
📦 Created 1672 batches
✅ FULL Batch 1/1672 done (20 images, 22.7 images/sec)
✅ FULL Batch 2/1672 done (20 images, 24.3 images/sec)
✅ FULL Batch 3/1672 done (20 images, 24.8 images/sec)
✅ FULL Batch 4/1672 done (20 images, 25.0 images/sec)
✅ FULL Batch 5/1672 done (20 images, 24.7 images/sec)
✅ FULL Batch 6/1672 done (20 images, 24.2 images/sec)
✅ FULL Batch 7/1672 done (20 images, 24.2 images/sec)
✅ FULL Batch 8/1672 done (20 images, 24.4 images/sec)
✅ FULL Batch 9/1672 done (20 i