<a href="https://colab.research.google.com/github/Buddika-Kasun/ML/blob/main/Dataset_Creation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# ===============================================================
# 📘 SLSL Sentence-Based Dataset Landmark Extraction Pipeline
# Uses MediaPipe Holistic to extract hand, pose, face, and lip landmarks
# The dataset is organized in Google Drive as one folder per sentence
# ===============================================================

In [2]:
# --- Install Dependencies ---
!pip install mediapipe opencv-python pandas numpy tqdm
!apt update && apt install -y ffmpeg

import cv2
import mediapipe as mp
import numpy as np
import pandas as pd
import os
import json
from tqdm import tqdm
import shutil

# --- Mount Google Drive ---
from google.colab import drive
drive.mount('/content/drive')

Collecting mediapipe
  Downloading mediapipe-0.10.21-cp312-cp312-manylinux_2_28_x86_64.whl.metadata (9.7 kB)
Collecting numpy
  Downloading numpy-1.26.4-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (61 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m61.0/61.0 kB[0m [31m2.0 MB/s[0m eta [36m0:00:00[0m
Collecting protobuf<5,>=4.25.3 (from mediapipe)
  Downloading protobuf-4.25.8-cp37-abi3-manylinux2014_x86_64.whl.metadata (541 bytes)
Collecting sounddevice>=0.4.4 (from mediapipe)
  Downloading sounddevice-0.5.3-py3-none-any.whl.metadata (1.6 kB)
INFO: pip is looking at multiple versions of opencv-python to determine which version is compatible with other requirements. This could take a while.
Collecting opencv-python
  Downloading opencv_python-4.11.0.86-cp37-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (20 kB)
INFO: pip is looking at multiple versions of jax to determine which version is compatible with other requirements. T

[33m0% [Working][0m            Get:1 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64  InRelease [1,581 B]
[33m0% [Connecting to archive.ubuntu.com (185.125.190.39)] [Connecting to security.[0m[33m0% [Connecting to archive.ubuntu.com (185.125.190.39)] [Connecting to security.[0m                                                                               Get:2 https://cloud.r-project.org/bin/linux/ubuntu jammy-cran40/ InRelease [3,632 B]
Hit:3 https://cli.github.com/packages stable InRelease
Get:4 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64  Packages [2,086 kB]
Get:5 http://security.ubuntu.com/ubuntu jammy-security InRelease [129 kB]
Hit:6 http://archive.ubuntu.com/ubuntu jammy InRelease
Get:7 http://archive.ubuntu.com/ubuntu jammy-updates InRelease [128 kB]
Hit:8 https://r2u.stat.illinois.edu/ubuntu jammy InRelease
Hit:9 https://ppa.launchpadcontent.net/deadsnakes/ppa/ubuntu jammy InRelease
Hit:10 https://ppa.launc

KeyboardInterrupt: 

# ===============================================================
# 📁 1. Folder Setup
# ===============================================================

In [None]:
def create_sentence_based_structure():
    """Make sure the Colab working directories exist, then list them.

    Creates ``/content/raw_videos``, ``/content/landmarks_data`` and
    ``/content/metadata`` (existing directories are left untouched) and
    prints a short summary of what each one is used for.
    """
    workspace_dirs = (
        '/content/raw_videos',
        '/content/landmarks_data',
        '/content/metadata',
    )
    for directory in workspace_dirs:
        os.makedirs(directory, exist_ok=True)

    summary_lines = (
        "✅ Folder structure ready!",
        "📁 /content/raw_videos – for input videos",
        "📁 /content/landmarks_data – for output .npy files",
        "📁 /content/metadata – for metadata and mapping files",
    )
    for line in summary_lines:
        print(line)

# ===============================================================
# 📂 2. Copy Dataset from Google Drive
# ===============================================================

In [None]:
def copy_from_drive(drive_path='/content/drive/MyDrive/SLSL_Dataset',
                    dest_path='/content/raw_videos'):
    """
    Copy your structured folder from Google Drive into Colab.
    Your Drive should have:
      MyDrive/SLSL_Dataset/
        ├── where_does_it_hurt/
        │   ├── signer_01_rep_1.mp4
        │   └── ...
        ├── i_have_a_headache/
        │   ├── signer_02_rep_1.mp4
        │   └── ...

    Args:
        drive_path: Source directory (one sub-folder per sentence).
        dest_path: Directory the tree is copied into. Existing contents are
            merged/overwritten (``dirs_exist_ok=True``).

    Returns:
        None. A per-sentence video count is printed; if ``drive_path`` is not
        a directory an error message is printed and nothing is copied.
    """
    # Same extensions the processing step accepts, so the counts printed here
    # match the number of videos that will actually be processed.
    video_extensions = ('.mp4', '.avi', '.mov', '.mkv')

    if not os.path.isdir(drive_path):
        print(f"❌ Folder not found in Google Drive at: {drive_path}")
        print("💡 Please check the folder path and name.")
        return

    shutil.copytree(drive_path, dest_path, dirs_exist_ok=True)
    print("✅ Copied entire structure from Google Drive!")

    # Only directories are sentence folders; a stray file in the dataset root
    # (README, .DS_Store, ...) would otherwise make os.listdir() raise below.
    sentences = sorted(
        entry for entry in os.listdir(dest_path)
        if os.path.isdir(os.path.join(dest_path, entry))
    )
    print(f"📁 Sentence folders copied: {len(sentences)}")
    for sentence in sentences:
        sentence_path = os.path.join(dest_path, sentence)
        video_count = sum(
            1 for f in os.listdir(sentence_path) if f.endswith(video_extensions)
        )
        print(f"   {sentence}: {video_count} videos")

# ===============================================================
# 🧠 3. Setup MediaPipe Holistic
# ===============================================================

In [None]:
# Module-level aliases for the MediaPipe solution modules used by this notebook.
mp_holistic = mp.solutions.holistic  # setup_mediapipe() builds its Holistic model from this
mp_drawing = mp.solutions.drawing_utils  # drawing helpers; not referenced by the cells visible here

def setup_mediapipe():
    """Create the MediaPipe Holistic model used for landmark extraction.

    The configuration targets video input (``static_image_mode=False``) with
    the mid-level model (``model_complexity=1``), landmark smoothing and
    refined face landmarks enabled, segmentation disabled, and detection and
    tracking confidence thresholds of 0.5.

    Returns:
        The configured ``mp_holistic.Holistic`` instance.
    """
    holistic_options = dict(
        static_image_mode=False,
        model_complexity=1,
        smooth_landmarks=True,
        enable_segmentation=False,
        refine_face_landmarks=True,
        min_detection_confidence=0.5,
        min_tracking_confidence=0.5,
    )
    return mp_holistic.Holistic(**holistic_options)

# ===============================================================
# 🎯 4. Extract Landmarks from Frame
# ===============================================================

In [None]:
# Face Mesh indices selected as the lip region (35 distinct indices).
# Built once at import time instead of on every frame. The earlier in-function
# list repeated 78, 87 and 178; only membership is tested, so the frozenset
# selects exactly the same landmarks while giving O(1) lookups.
_LIP_INDICES = frozenset([
    61, 84, 314, 17, 87, 178, 88, 95, 78, 62, 96, 89,
    146, 91, 181, 76, 184, 74, 183, 42, 13, 82, 81, 80,
    191, 312, 311, 310, 415, 308, 324, 318, 402,
    317, 14,
])

# Pose landmarks 0..24 are kept; the remaining pose landmarks are dropped.
_UPPER_BODY_LANDMARK_COUNT = 25


def _flatten_landmarks(landmarks):
    """Flatten an iterable of landmarks into ``[x, y, z, visibility, ...]``."""
    flat = []
    for landmark in landmarks:
        flat.extend([landmark.x, landmark.y, landmark.z, landmark.visibility])
    return flat


def extract_frame_landmarks(frame, holistic_model):
    """Extract hand, pose, face and lip landmarks from one frame.

    Args:
        frame: BGR image (it is converted with ``cv2.COLOR_BGR2RGB``).
        holistic_model: Object whose ``process(rgb_frame)`` returns results
            exposing ``left_hand_landmarks``, ``right_hand_landmarks``,
            ``pose_landmarks`` and ``face_landmarks``.

    Returns:
        dict with keys ``left_hand``, ``right_hand``, ``pose``, ``face``,
        ``lip_roi`` and ``timestamp``. Each landmark entry is a flat list of
        ``x, y, z, visibility`` values (four per landmark), or ``None`` when
        that part was not detected. ``lip_roi`` holds the face landmarks whose
        index is in ``_LIP_INDICES``, in ascending index order. ``timestamp``
        is always ``None`` here; the caller fills it in.
    """
    frame_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
    # Mark read-only before inference, as in MediaPipe's own examples.
    frame_rgb.flags.writeable = False
    results = holistic_model.process(frame_rgb)

    landmarks_dict = {
        'left_hand': None,
        'right_hand': None,
        'pose': None,
        'face': None,
        'lip_roi': None,
        'timestamp': None
    }

    # NOTE(review): visibility is stored for every landmark type to keep the
    # 4-values-per-landmark layout; whether MediaPipe fills it meaningfully
    # for hand/face landmarks should be confirmed before relying on it.

    # --- Hands ---
    if results.left_hand_landmarks:
        landmarks_dict['left_hand'] = _flatten_landmarks(
            results.left_hand_landmarks.landmark)
    if results.right_hand_landmarks:
        landmarks_dict['right_hand'] = _flatten_landmarks(
            results.right_hand_landmarks.landmark)

    # --- Pose (upper body) ---
    if results.pose_landmarks:
        pose_points = results.pose_landmarks.landmark
        landmarks_dict['pose'] = _flatten_landmarks(
            pose_points[i] for i in range(_UPPER_BODY_LANDMARK_COUNT))

    # --- Full face mesh and lip subset ---
    if results.face_landmarks:
        face_points = results.face_landmarks.landmark
        landmarks_dict['face'] = _flatten_landmarks(face_points)
        landmarks_dict['lip_roi'] = _flatten_landmarks(
            point for index, point in enumerate(face_points)
            if index in _LIP_INDICES)

    return landmarks_dict

# ===============================================================
# 🎞️ 5. Process a Single Video
# ===============================================================

In [None]:
def process_single_video(video_path, holistic_model, max_frames=None):
    """Process one video and extract landmarks frame by frame.

    Args:
        video_path: Path of the video file to open with OpenCV.
        holistic_model: Model passed through to ``extract_frame_landmarks``.
        max_frames: Optional cap on the number of frames processed; ``None``
            (or 0) means no cap.

    Returns:
        List with one landmarks dict per processed frame. Each dict's
        ``timestamp`` is ``frame_index / fps`` in seconds, or ``None`` when the
        container reports no usable FPS. The list is empty if the video cannot
        be opened or yields no frames.
    """
    cap = cv2.VideoCapture(video_path)
    frames_data = []

    try:
        fps = cap.get(cv2.CAP_PROP_FPS)
        total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))

        # OpenCV reports 0 (occasionally NaN) when the FPS is unknown; dividing
        # by it would abort the whole video, so timestamps become None instead.
        has_fps = bool(fps) and fps > 0

        print(f"\n📹 Processing: {os.path.basename(video_path)} ({total_frames} frames)")
        if not has_fps:
            print("⚠️ Video reports no valid FPS; timestamps will be None")

        # Progress-bar total: unknown (None) when the container gives no frame
        # count, and never larger than the max_frames cap.
        expected_frames = total_frames if total_frames > 0 else None
        if max_frames:
            expected_frames = (min(expected_frames, max_frames)
                               if expected_frames else max_frames)

        frame_count = 0
        with tqdm(total=expected_frames, desc="Extracting landmarks") as pbar:
            while cap.isOpened():
                # Check the cap before reading so no frame is decoded needlessly.
                if max_frames and frame_count >= max_frames:
                    break
                ret, frame = cap.read()
                if not ret:
                    break

                landmarks = extract_frame_landmarks(frame, holistic_model)
                landmarks['timestamp'] = frame_count / fps if has_fps else None
                frames_data.append(landmarks)
                frame_count += 1
                pbar.update(1)
    finally:
        # Always free the decoder, even if landmark extraction raised.
        cap.release()

    print(f"✅ Extracted {len(frames_data)} frames.")
    return frames_data

# ===============================================================
# 🧩 6. Process Entire Dataset (Sentence-based)
# ===============================================================

In [None]:
def process_sentence_based_dataset(raw_videos_root='/content/raw_videos',
                                   output_folder='/content/landmarks_data',
                                   test_mode=False,
                                   holistic_model=None,
                                   metadata_folder='/content/metadata'):
    """Process all sentence folders in the dataset.

    Every sub-folder of ``raw_videos_root`` is treated as one sentence. Each
    video inside it is run through ``process_single_video`` and the resulting
    list of per-frame dicts is saved as a pickled object ``.npy`` file.

    Args:
        raw_videos_root: Directory containing one folder per sentence.
        output_folder: Directory for the ``.npy`` landmark files (created if
            missing).
        test_mode: When true, only the first video of each sentence is
            processed and it is capped at 50 frames.
        holistic_model: Model handed to ``process_single_video``. When omitted,
            the module-level ``holistic`` set by ``main_pipeline`` is used if
            present, otherwise a new model is built with ``setup_mediapipe()``.
        metadata_folder: Directory for ``sentence_dataset_metadata.csv`` and
            ``sentence_mapping.json`` (created if missing).

    Returns:
        ``(metadata_df, sentence_mapping)`` where ``metadata_df`` has one row
        per saved or failed video and ``sentence_mapping`` maps sentence name
        to integer id; ``(None, None)`` when no row was recorded.
    """
    video_extensions = ('.mp4', '.avi', '.mov', '.mkv')

    # Resolve the model explicitly instead of silently depending on a global
    # that only exists after main_pipeline() has been run.
    if holistic_model is None:
        holistic_model = globals().get('holistic')
    if holistic_model is None:
        holistic_model = setup_mediapipe()

    os.makedirs(output_folder, exist_ok=True)
    os.makedirs(metadata_folder, exist_ok=True)

    metadata = []
    written_files = set()

    print("🔍 Scanning for sentence folders...")
    # Sorted so that sentence ids (assigned in order of appearance below) and
    # the video picked in test mode are the same on every run.
    sentence_folders = sorted(
        f for f in os.listdir(raw_videos_root)
        if os.path.isdir(os.path.join(raw_videos_root, f))
    )
    print(f"📁 Found {len(sentence_folders)} sentences.")

    for sentence in sentence_folders:
        sentence_path = os.path.join(raw_videos_root, sentence)
        video_files = sorted(
            f for f in os.listdir(sentence_path) if f.endswith(video_extensions)
        )

        print(f"\n🎬 Sentence: {sentence} – {len(video_files)} videos")
        if test_mode:
            video_files = video_files[:1]
            print("⚙️ TEST MODE: Only 1 video processed per sentence")

        for video_file in video_files:
            video_path = os.path.join(sentence_path, video_file)
            # Expected name: signer_<id>_rep_<n>.<ext>. splitext drops whatever
            # extension is present (previously only '.mp4' was removed, which
            # left e.g. '1.avi' as the repetition number).
            filename_parts = os.path.splitext(video_file)[0].split('_')
            signer_id = filename_parts[1] if len(filename_parts) > 1 else "unknown"
            rep_number = filename_parts[3] if len(filename_parts) > 3 else "1"

            try:
                landmarks_seq = process_single_video(
                    video_path, holistic_model,
                    max_frames=50 if test_mode else None)
                if landmarks_seq:
                    base_name = f"{sentence}_signer_{signer_id}_rep_{rep_number}"
                    output_filename = f"{base_name}.npy"
                    # Videos whose names parse to the same signer/rep would
                    # overwrite each other; give later ones a distinct name.
                    duplicate = 2
                    while output_filename in written_files:
                        output_filename = f"{base_name}_dup{duplicate}.npy"
                        duplicate += 1
                    written_files.add(output_filename)

                    output_path = os.path.join(output_folder, output_filename)
                    # A list of dicts is stored as a pickled object array.
                    np.save(output_path, landmarks_seq)

                    total = len(landmarks_seq)
                    left_hand_frames = sum(1 for f in landmarks_seq if f['left_hand'])
                    right_hand_frames = sum(1 for f in landmarks_seq if f['right_hand'])
                    lip_frames = sum(1 for f in landmarks_seq if f['lip_roi'])

                    metadata.append({
                        'landmarks_file': output_filename,
                        'sentence': sentence,
                        'signer_id': signer_id,
                        'rep_number': rep_number,
                        'original_video': video_file,
                        'video_path': video_path,
                        'total_frames': total,
                        'left_hand_frames': left_hand_frames,
                        'right_hand_frames': right_hand_frames,
                        'lip_frames': lip_frames,
                        'left_hand_coverage': (left_hand_frames / total) * 100,
                        'right_hand_coverage': (right_hand_frames / total) * 100,
                        'lip_coverage': (lip_frames / total) * 100,
                        'success': True
                    })
                    print(f"💾 Saved landmarks → {output_filename}")
                else:
                    print(f"⚠️ No landmarks extracted in {video_file}")

            except Exception as e:
                # Deliberately broad: one bad video must not stop the batch.
                # The failure is recorded in the metadata rather than swallowed.
                print(f"❌ Error processing {video_file}: {str(e)}")
                metadata.append({
                    'landmarks_file': 'FAILED',
                    'sentence': sentence,
                    'signer_id': signer_id,
                    'rep_number': rep_number,
                    'original_video': video_file,
                    'error': str(e),
                    'success': False
                })

    # Save metadata
    if metadata:
        metadata_df = pd.DataFrame(metadata)
        meta_csv = os.path.join(metadata_folder, 'sentence_dataset_metadata.csv')
        metadata_df.to_csv(meta_csv, index=False)

        sentence_mapping = {s: i for i, s in enumerate(metadata_df['sentence'].unique())}
        with open(os.path.join(metadata_folder, 'sentence_mapping.json'), 'w') as f:
            json.dump(sentence_mapping, f, indent=2)

        print(f"\n📊 Metadata saved → {meta_csv}")
        return metadata_df, sentence_mapping
    else:
        print("❌ No videos processed successfully!")
        return None, None

# ===============================================================
# 📈 7. Dataset Analysis
# ===============================================================

In [None]:
def analyze_sentence_dataset(metadata_df):
    """Print statistics about the processed dataset.

    Args:
        metadata_df: DataFrame with at least the columns ``success`` (bool),
            ``sentence`` and ``signer_id`` — one row per processed video.

    Prints the overall success rate followed by the number of successfully
    processed videos per sentence and per signer. An empty frame is reported
    as zero videos instead of failing on the percentage calculation.
    """
    print("\n📊 SENTENCE DATASET ANALYSIS")
    print("=" * 50)

    total = len(metadata_df)
    if total == 0:
        # Nothing to break down, and success/total would divide by zero.
        print("Videos processed: 0, Success: 0 (0.0%)")
        return

    success = metadata_df['success'].sum()
    print(f"Videos processed: {total}, Success: {success} ({success/total*100:.1f}%)")

    # Filter once and reuse for both breakdowns.
    successful = metadata_df[metadata_df['success']]

    sentence_stats = successful['sentence'].value_counts()
    print("\n📝 Videos per sentence:")
    for s, c in sentence_stats.items():
        print(f"   {s}: {c}")

    signer_stats = successful['signer_id'].value_counts()
    print("\n👥 Videos per signer:")
    for s, c in signer_stats.items():
        print(f"   Signer {s}: {c}")

# ===============================================================
# 🔍 8. Verify Landmark Files
# ===============================================================

In [None]:
def verify_landmark_files(folder='/content/landmarks_data'):
    """Spot-check up to three saved ``.npy`` landmark files.

    Prints how many ``.npy`` files ``folder`` contains, then for the first
    three (in directory-listing order) the number of stored frames and the
    keys of the first frame.
    """
    print("\n🔍 Verifying sample landmark files...")
    npy_names = list(filter(lambda name: name.endswith('.npy'), os.listdir(folder)))
    print(f"Found {len(npy_names)} landmark files.")

    sample_size = 3
    for npy_name in npy_names[:sample_size]:
        # NOTE(security): allow_pickle=True is needed because each file holds
        # an object array of dicts, but unpickling executes code from the
        # file — only load landmark files produced by this pipeline.
        frames = np.load(os.path.join(folder, npy_name), allow_pickle=True)
        frame_total = len(frames)
        print(f"\n📄 {npy_name} – {frame_total} frames")
        if frame_total == 0:
            continue
        print("Keys:", list(frames[0].keys()))

# ===============================================================
# 🚀 9. Main Pipeline
# ===============================================================

In [None]:
def main_pipeline(test_mode=True):
    """Run the whole dataset-creation workflow from folder setup to verification.

    Steps, in order: create the working folders, copy the dataset from Google
    Drive, build the MediaPipe model (stored in the module-level ``holistic``
    so the dataset step can reach it), process every sentence folder, and —
    if any metadata came back — print the analysis and spot-check the output.

    Args:
        test_mode: Forwarded to ``process_sentence_based_dataset``.

    Returns:
        ``(metadata_df, sentence_mapping)`` on success, ``(None, None)`` when
        the processing step returned no metadata.
    """
    global holistic

    print("🚀 STARTING SLSL SENTENCE-BASED DATASET PIPELINE")
    print("=" * 60)

    # Workspace folders, then the raw videos (adjust the Drive path if needed).
    create_sentence_based_structure()
    copy_from_drive(drive_path='/content/drive/MyDrive/SLSL_Dataset')

    # The model is published as a global before the dataset step runs.
    holistic = setup_mediapipe()
    print("✅ MediaPipe Holistic initialized")

    metadata_df, sentence_mapping = process_sentence_based_dataset(test_mode=test_mode)

    # Bail out early when nothing was processed.
    if metadata_df is None:
        print("❌ Dataset creation failed.")
        return None, None

    analyze_sentence_dataset(metadata_df)
    verify_landmark_files()
    print("\n🎉 Dataset processing complete!")
    return metadata_df, sentence_mapping

# ===============================================================
# ▶️ 10. Run Pipeline
# ===============================================================

In [None]:
# Smoke-test run: with test_mode=True the dataset step handles only the first
# video of each sentence and stops after 50 frames.
metadata, mapping = main_pipeline(test_mode=True)
# When ready for the full run, set test_mode=False:
# metadata, mapping = main_pipeline(test_mode=False)