In [1]:
import os
import random
import time

import cv2
import mediapipe as mp
import numpy as np
from scipy.interpolate import interp1d
from scipy.spatial import distance

In [2]:
# Short aliases for the MediaPipe solution modules used throughout the notebook:
# the Holistic model itself plus the drawing helpers and default drawing styles.
mp_holistic, mp_drawing, mp_drawing_styles = (
    mp.solutions.holistic,
    mp.solutions.drawing_utils,
    mp.solutions.drawing_styles,
)

In [3]:
def extract_keypoints(results):
    """
    Flatten one MediaPipe Holistic result into a single 1-D feature vector.

    Layout of the returned array (162 values when each detected hand carries
    21 landmarks, as MediaPipe produces):
      [0:63]     left-hand landmarks as (x, y, z) triples; zeros if no left hand
      [63:126]   right-hand landmarks as (x, y, z) triples; zeros if no right hand
      [126:153]  9 selected pose landmarks as (x, y, z) triples; zeros if no pose
      [153:162]  three (x, y, z) offsets:
                   left-hand centre  - left shoulder  (pose landmark 11)
                   right-hand centre - right shoulder (pose landmark 12)
                   right-hand centre - left-hand centre
                 Each offset is zero when the hand(s) it needs are missing, and
                 all nine values are zero when no pose or no hand was detected.
    """
    def xyz(point):
        # One landmark as a plain [x, y, z] list.
        return [point.x, point.y, point.z]

    left_hand = results.left_hand_landmarks
    right_hand = results.right_hand_landmarks
    body = results.pose_landmarks

    # Raw hand coordinates, zero-filled when the hand was not detected.
    if left_hand is None:
        lh = np.zeros(63)
    else:
        lh = np.array([xyz(lm) for lm in left_hand.landmark]).flatten()

    if right_hand is None:
        rh = np.zeros(63)
    else:
        rh = np.array([xyz(lm) for lm in right_hand.landmark]).flatten()

    # Pose subset: 9 landmarks x 3 coordinates = 27 values. In MediaPipe Pose
    # numbering these are nose (0), shoulders (11, 12), elbows (13, 14),
    # wrists (15, 16) and hips (23, 24).
    upper_body_indices = [0, 11, 12, 13, 14, 15, 16, 23, 24]
    pose = np.zeros(len(upper_body_indices) * 3)
    if body:
        landmarks = body.landmark
        for slot, idx in enumerate(upper_body_indices):
            if idx < len(landmarks):
                pose[slot * 3:slot * 3 + 3] = xyz(landmarks[idx])

    # Hand positions relative to the shoulders and to each other.
    if not (body and (left_hand or right_hand)):
        # Nothing to relate: keep the slot but fill it with zeros.
        relative_features = np.zeros(9)
    else:
        left_shoulder = np.array(xyz(landmarks[11])) if 11 < len(landmarks) else np.zeros(3)
        right_shoulder = np.array(xyz(landmarks[12])) if 12 < len(landmarks) else np.zeros(3)

        # Hand centre = mean of all landmarks of that hand.
        left_center = None
        right_center = None
        if left_hand:
            left_center = np.mean(np.array([xyz(lm) for lm in left_hand.landmark]), axis=0)
        if right_hand:
            right_center = np.mean(np.array([xyz(lm) for lm in right_hand.landmark]), axis=0)

        relative_features = []
        relative_features.extend(
            np.zeros(3) if left_center is None else left_center - left_shoulder)
        relative_features.extend(
            np.zeros(3) if right_center is None else right_center - right_shoulder)

        both_hands = left_center is not None and right_center is not None
        relative_features.extend(right_center - left_center if both_hands else np.zeros(3))

    return np.concatenate([lh, rh, pose, relative_features])



In [4]:
def calculate_velocity(current_frame, previous_frame, time_delta=1.0):
    """
    Per-feature velocity between two consecutive landmark vectors.

    Returns (current_frame - previous_frame) / time_delta, or a zero vector
    with one entry per element of current_frame when there is no previous
    frame (previous_frame is None).
    """
    if previous_frame is not None:
        displacement = current_frame - previous_frame
        return displacement / time_delta

    # First frame of a sequence: no history, so report zero motion.
    return np.zeros(len(current_frame))

In [5]:
def extract_trajectory_features(sequence, landmark_indices=(0, 9, 13, 17)):
    """
    Extract trajectory-based features for specific landmarks across frames.

    Args:
        sequence: Array of shape (frames, features) in which landmark ``i``
            occupies columns ``3*i`` (x), ``3*i + 1`` (y) and ``3*i + 2`` (z).
        landmark_indices: Iterable of landmark indices to track.

    Returns:
        1-D array with three values per tracked landmark, in order:
          * path_length: summed 3-D distance between consecutive frames.
          * direction_changes: sum of ``|diff(sign(dx))|`` plus the same for
            dy. A full reversal (+ to -) therefore contributes 2 and a
            stop/start transition contributes 1.
          * curvature: mean planar (x, y) curvature over all frames, using
            finite differences. Frames where the landmark has zero planar
            speed contribute 0 instead of producing NaN/inf. Zero when the
            sequence has fewer than 3 frames.
    """
    sequence = np.asarray(sequence, dtype=float)
    trajectory_features = []

    for idx in landmark_indices:
        base_idx = idx * 3  # Each landmark has x, y, z

        # Trajectory of this landmark across all frames
        x_pos = sequence[:, base_idx]
        y_pos = sequence[:, base_idx + 1]
        z_pos = sequence[:, base_idx + 2]

        # Path length (cumulative distance travelled)
        dx = np.diff(x_pos)
        dy = np.diff(y_pos)
        dz = np.diff(z_pos)
        path_length = np.sum(np.sqrt(dx**2 + dy**2 + dz**2))

        # Direction changes in x and y (velocity sign flips)
        direction_changes = (np.sum(np.abs(np.diff(np.sign(dx)))) +
                             np.sum(np.abs(np.diff(np.sign(dy)))))

        # Average curvature (needs at least 3 points)
        curvature = 0.0
        if len(x_pos) > 2:
            dx_dt = np.gradient(x_pos)
            dy_dt = np.gradient(y_pos)
            d2x_dt2 = np.gradient(dx_dt)
            d2y_dt2 = np.gradient(dy_dt)

            numerator = np.abs(dx_dt * d2y_dt2 - dy_dt * d2x_dt2)
            denominator = (dx_dt**2 + dy_dt**2) ** 1.5

            # A stationary landmark (including the all-zero placeholder used
            # for undetected hands) has a zero denominator; dividing there
            # would poison the mean with NaN/inf, so those frames stay at 0.
            per_frame = np.zeros_like(numerator)
            np.divide(numerator, denominator, out=per_frame, where=denominator > 0)
            curvature = np.mean(per_frame)

        trajectory_features.extend([path_length, direction_changes, curvature])

    return np.array(trajectory_features)

In [6]:
def extract_dynamics_features(sequence):
    """
    Summarise acceleration and jerk magnitudes of a (frames, features) sequence.

    When the sequence has more than 200 columns, only the first half of the
    columns is treated as position data; otherwise every column is used.
    Positions are differentiated three times along the frame axis with
    np.gradient, and the per-frame Euclidean norms of the second (acceleration)
    and third (jerk) derivatives are reduced to mean, std and max.

    Returns:
        Array [acc_mean, acc_std, acc_max, jerk_mean, jerk_std, jerk_max].
    """
    n_columns = sequence.shape[1]
    if n_columns > 200:
        positions = sequence[:, :n_columns // 2]
    else:
        positions = sequence

    # Velocity, acceleration and jerk are successive gradients over time.
    derivatives = []
    signal = positions
    for _ in range(3):
        signal = np.gradient(signal, axis=0)
        derivatives.append(signal)
    _, accelerations, jerk = derivatives

    # Same three statistics for each signal, acceleration first.
    summary = []
    for derivative in (accelerations, jerk):
        magnitudes = np.linalg.norm(derivative, axis=1)
        summary.extend([np.mean(magnitudes), np.std(magnitudes), np.max(magnitudes)])

    return np.array(summary)

In [7]:
def extract_hand_shape_dynamics(results):
    """
    Describe the shape of the right hand in a single Holistic result.

    Only ``results.right_hand_landmarks`` is inspected; the left hand is not
    used by this function.

    Returns:
        Float array ``[finger_spread, palm_area, pinch_distance]``:
          * finger_spread: summed 3-D distance between adjacent fingertips
            (landmarks 4, 8, 12, 16, 20).
          * palm_area: area of the triangle fan anchored at landmark 0 over
            landmarks 1, 5, 9, 13, 17, measured in the (x, y) plane.
          * pinch_distance: 3-D distance between landmarks 4 and 8.
        All three are 0.0 when no right hand was detected.
    """
    hand = results.right_hand_landmarks
    if not hand:
        # Float zeros so the dtype matches the detected-hand case.
        return np.zeros(3)

    landmarks = hand.landmark

    # 1. Finger spread: distances between neighbouring fingertips.
    fingertips = [4, 8, 12, 16, 20]  # Thumb, index, middle, ring, pinky
    tips = np.array([[landmarks[idx].x, landmarks[idx].y, landmarks[idx].z]
                     for idx in fingertips])
    finger_spread = np.sum(np.linalg.norm(np.diff(tips, axis=0), axis=1))

    # 2. Palm area: fan of triangles from the first palm point.
    palm_points = [0, 1, 5, 9, 13, 17]
    palm = np.array([[landmarks[idx].x, landmarks[idx].y] for idx in palm_points])
    spokes = palm[1:] - palm[0]
    # z-component of the 2-D cross product, written out explicitly because
    # np.cross on 2-element vectors is deprecated since NumPy 2.0.
    cross_z = spokes[:-1, 0] * spokes[1:, 1] - spokes[:-1, 1] * spokes[1:, 0]
    palm_area = 0.5 * np.sum(np.abs(cross_z))

    # 3. Thumb-index pinch distance.
    pinch_distance = np.linalg.norm(tips[0] - tips[1])

    return np.array([finger_spread, palm_area, pinch_distance], dtype=float)

In [8]:
def _dominant_components(signal, n_components):
    """Return (frequencies, amplitudes) of the strongest rFFT bins, zero-padded to n_components."""
    width = max(n_components, 0)
    freqs = np.zeros(width)
    amps = np.zeros(width)
    if width == 0 or len(signal) == 0:
        return freqs, amps

    spectrum = np.abs(np.fft.rfft(signal))
    # Strongest bins first. Reversing before slicing (instead of slicing with
    # [-n:]) keeps n == 0 from selecting the whole spectrum.
    top = spectrum.argsort()[::-1][:width]
    found = len(top)

    # Bin index k corresponds to k / N cycles per frame.
    freqs[:found] = top / len(signal)

    peak = np.max(spectrum)
    amps[:found] = spectrum[top] / peak if peak > 0 else spectrum[top]
    return freqs, amps


def extract_frequency_features(sequence, n_components=5):
    """
    Extract frequency-domain features using the Fast Fourier Transform.
    Useful for capturing repetitive patterns in sign movements.

    Args:
        sequence: Array of shape (frames, features); the first 126 columns are
            taken to be hand landmarks stored as (x, y, z) triples.
        n_components: Number of dominant frequency bins kept per signal.

    Returns:
        1-D array of length ``6 * 4 * n_components``. For each of the key
        landmarks 0, 4, 8, 12, 16, 20 it holds, in order: the dominant x
        frequencies, the dominant y frequencies, the matching x amplitudes
        and the matching y amplitudes (amplitudes are divided by the largest
        magnitude of that signal's spectrum). When the sequence is too short
        to provide ``n_components`` bins, the missing entries are zero so the
        output length does not depend on the number of frames.

    Note:
        With ``base_idx = landmark * 3`` and landmarks up to 20, only columns
        0-62 are read, i.e. the first 63-column hand block.
    """
    # Get only hand landmark positions
    hand_landmarks = sequence[:, :126]

    # For computational efficiency, analyse only a subset of landmarks
    key_landmarks = [0, 4, 8, 12, 16, 20]

    fft_features = []
    for lm in key_landmarks:
        base_idx = lm * 3  # Each landmark has x, y, z

        freq_x, amp_x = _dominant_components(hand_landmarks[:, base_idx], n_components)
        freq_y, amp_y = _dominant_components(hand_landmarks[:, base_idx + 1], n_components)

        fft_features.extend(np.concatenate([freq_x, freq_y, amp_x, amp_y]))

    return np.array(fft_features)

In [9]:
def _velocity_correlation(a, b):
    """Pearson correlation of two 1-D signals; 0.0 when it is undefined."""
    if a.size < 2 or a.max() == a.min() or b.max() == b.min():
        # A constant signal has zero variance, so the coefficient is undefined
        # (np.corrcoef would emit a warning and return NaN).
        return 0.0
    value = np.corrcoef(a, b)[0, 1]
    return float(value) if np.isfinite(value) else 0.0


def extract_relative_movement(sequence):
    """
    Extract features describing how the left and right hand move relative to
    each other.

    Args:
        sequence: Array of shape (frames, features) whose columns 0-62 hold
            the left hand and columns 63-125 the right hand, both as
            (x, y, z) triples.

    Returns:
        Array ``[corr_x, corr_y, corr_z, dist_mean, dist_std, dist_change,
        avg_angle]``:
          * corr_*: correlation between the two hand-centre velocities per
            axis (0 when undefined or when fewer than 2 frames are given).
          * dist_*: mean, std and (max - min) of the distance between the two
            hand centres over time.
          * avg_angle: mean angle in radians between the frame-to-frame
            movement vectors of the two hands, over frames where both hands
            actually moved (0 if there are none).
        An empty sequence yields seven zeros.
    """
    sequence = np.asarray(sequence, dtype=float)
    n_frames = sequence.shape[0]
    if n_frames == 0:
        return np.zeros(7)

    # Separate left and right hand data
    lh = sequence[:, :63]
    rh = sequence[:, 63:126]

    # Hand centres (average position of all landmarks), shape (frames, 3)
    lh_center = np.mean(lh.reshape(n_frames, -1, 3), axis=1)
    rh_center = np.mean(rh.reshape(n_frames, -1, 3), axis=1)

    # 1. Movement correlation between hands. np.gradient needs at least two
    #    frames, so a single frame simply reports no correlation.
    if n_frames > 1:
        lh_vel = np.gradient(lh_center, axis=0)
        rh_vel = np.gradient(rh_center, axis=0)
        corr_x, corr_y, corr_z = (
            _velocity_correlation(lh_vel[:, axis], rh_vel[:, axis]) for axis in range(3)
        )
    else:
        corr_x = corr_y = corr_z = 0.0

    # 2. Relative distance between hands over time
    distances = np.linalg.norm(lh_center - rh_center, axis=1)
    dist_mean = np.mean(distances)
    dist_std = np.std(distances)
    dist_change = np.max(distances) - np.min(distances)

    # 3. Angle between the per-frame movement vectors of the two hands
    lh_moves = np.diff(lh_center, axis=0)
    rh_moves = np.diff(rh_center, axis=0)
    lh_mag = np.linalg.norm(lh_moves, axis=1)
    rh_mag = np.linalg.norm(rh_moves, axis=1)
    moving = (lh_mag > 0) & (rh_mag > 0)

    if np.any(moving):
        cos_angle = (np.sum(lh_moves[moving] * rh_moves[moving], axis=1) /
                     (lh_mag[moving] * rh_mag[moving]))
        # Clip to absorb rounding errors just outside [-1, 1]
        avg_angle = np.mean(np.arccos(np.clip(cos_angle, -1.0, 1.0)))
    else:
        avg_angle = 0

    return np.array([corr_x, corr_y, corr_z, dist_mean, dist_std, dist_change, avg_angle])

In [10]:
def extract_all_features(results, previous_results_buffer=None):
    """
    Build the full per-frame feature vector with a fixed layout.

    The result is the concatenation of:
      1. the keypoints of ``results`` (see extract_keypoints),
      2. their velocity relative to the most recent entry of
         ``previous_results_buffer`` (all zeros when the buffer is None or
         empty),
      3. 30 zeros reserved as a placeholder for further dynamic features.

    The length is therefore the same whether or not history is available.
    """
    keypoints = extract_keypoints(results)

    has_history = previous_results_buffer is not None and len(previous_results_buffer) >= 1
    if has_history:
        # Velocity is measured against the newest buffered frame only.
        previous_keypoints = extract_keypoints(previous_results_buffer[-1])
        velocities = calculate_velocity(keypoints, previous_keypoints)
    else:
        # No earlier frame: zero velocity keeps the dimensions consistent.
        velocities = np.zeros_like(keypoints)

    # Placeholder block - not yet populated with real dynamic features.
    dynamic_features = np.zeros(30)

    return np.concatenate([keypoints, velocities, dynamic_features])

In [11]:
def augment_data(input_dir="../data/raw", output_dir="../data/processed", num_augments=5):
    """
    Augment every ``.npy`` sequence in ``input_dir`` and write the results to
    ``output_dir``.

    For each input file ``<name>.npy`` holding a (frames, features) array the
    following files are written:
      * ``<name>.npy``         - unchanged copy of the original
      * ``<name>_noise1.npy``  - Gaussian jitter, sigma 0.01
      * ``<name>_noise2.npy``  - Gaussian jitter, sigma 0.02
      * ``<name>_warp1.npy``   - resampled at 0.8x the frame count, then
                                 linearly interpolated back to the original length
      * ``<name>_warp2.npy``   - same with 1.2x the frame count
      * ``<name>_shift1.npy``  - random x/y translation within +/-0.05

    Noise and shifts are applied to the first half of the feature columns
    only (``features // 2``), which this function treats as the landmark
    positions. Randomness comes from the global ``np.random`` state.

    Files that cannot be loaded as plain arrays (e.g. pickled object arrays)
    or that are not non-empty 2-D arrays are reported and skipped.

    Args:
        input_dir: Directory containing the source ``.npy`` files.
        output_dir: Directory for the copies and augmented files (created if
            missing).
        num_augments: Currently unused; kept so existing callers keep working.
    """
    os.makedirs(output_dir, exist_ok=True)

    for file in sorted(os.listdir(input_dir)):
        if not file.endswith(".npy"):
            continue

        filepath = os.path.join(input_dir, file)
        try:
            sequence = np.load(filepath)
        except ValueError as exc:
            # Raised for object arrays, which need pickle; loading untrusted
            # pickles is unsafe, so such files are skipped rather than enabled.
            print(f"Skipping {filepath}: {exc}")
            continue

        if sequence.ndim != 2 or sequence.shape[0] == 0:
            print(f"Skipping {filepath}: expected a non-empty (frames, features) array, "
                  f"got shape {sequence.shape}")
            continue

        n_frames, n_features = sequence.shape
        landmarks_size = n_features // 2  # First half is landmarks, second half is velocity
        # splitext keeps dots inside the name, so "a.v1.npy" and "a.v2.npy"
        # no longer collapse onto the same output files.
        base_filename = os.path.splitext(file)[0]

        # Original sequence - also save to processed folder
        original_filename = os.path.join(output_dir, file)
        np.save(original_filename, sequence)
        print(f"Saved original sequence: {original_filename}")

        augmented_sequences = []

        # 1. Add noise (jitter) to the landmark positions only
        for i in range(2):
            noise_factor = 0.01 + (i * 0.01)  # 0.01 and 0.02
            noisy_seq = sequence.copy()
            noise = np.random.normal(0, noise_factor, (n_frames, landmarks_size))
            noisy_seq[:, :landmarks_size] += noise
            augmented_sequences.append((noisy_seq, f"noise{i+1}"))

        # 2. Time warping (speed variation)
        for i in range(2):
            warp_factor = 0.8 + (i * 0.4)  # 0.8 and 1.2
            # At least two samples are needed to interpolate back.
            n_warped = max(int(n_frames * warp_factor), 2)
            warped_indices = np.linspace(0, n_frames - 1, n_warped)
            warped_indices = np.clip(warped_indices, 0, n_frames - 1).astype(int)
            warped_seq = sequence[warped_indices].copy()

            # Resize to the original sequence length with linear interpolation
            if warped_seq.shape[0] != n_frames:
                x_original = np.linspace(0, 1, warped_seq.shape[0])
                x_new = np.linspace(0, 1, n_frames)
                warped_resized = np.zeros_like(sequence)
                # axis=0 interpolates every feature column in one call
                warped_resized[:] = interp1d(x_original, warped_seq,
                                             kind='linear', axis=0)(x_new)
                warped_seq = warped_resized

            augmented_sequences.append((warped_seq, f"warp{i+1}"))

        # 3. Spatial translation (small shifts of x and y, not z or velocities)
        for i in range(1):
            shift_factor = 0.05
            shifted_seq = sequence.copy()

            x_shift = np.random.uniform(-shift_factor, shift_factor)
            y_shift = np.random.uniform(-shift_factor, shift_factor)

            # Columns come in (x, y, z) triples: shift offsets 0 and 1 of each
            for j in range(0, landmarks_size, 3):
                shifted_seq[:, j] += x_shift
                shifted_seq[:, j+1] += y_shift

            augmented_sequences.append((shifted_seq, f"shift{i+1}"))

        # Save all augmented sequences
        for aug_seq, aug_type in augmented_sequences:
            new_filename = os.path.join(output_dir, f"{base_filename}_{aug_type}.npy")
            np.save(new_filename, aug_seq)
            print(f"Saved augmented data: {new_filename}")



In [12]:
def record_gesture(gesture_label, num_samples=30, sequence_length=30,
                   output_dir="../data/raw", video_dir="../data/videos"):
    """
    Interactively capture keypoint sequences for a gesture from the webcam.

    A preview window shows the mirrored camera image with pose and hand
    landmarks. Pressing 's' starts a 3-second countdown, after which
    ``sequence_length`` frames are recorded; pressing 'q' aborts the current
    recording or, in preview mode, ends the session. Every recorded frame is
    converted with ``extract_all_features`` so all rows of a sample have the
    same length.

    For each completed sample ``N`` two files are written:
      * ``<output_dir>/<gesture_label>_<N>.npy`` - array with one feature row
        per frame
      * ``<video_dir>/<gesture_label>_<N>.avi``  - the mirrored raw frames (XVID)

    Arguments:
    - gesture_label: string label for the gesture (e.g., 'namaste')
    - num_samples: number of samples to record
    - sequence_length: number of frames per sample sequence
    - output_dir: directory to save the keypoint sequences
    - video_dir: directory to save the raw video recordings

    Raises:
    - RuntimeError: if the webcam (device 0) cannot be opened.
    """
    os.makedirs(output_dir, exist_ok=True)
    os.makedirs(video_dir, exist_ok=True)

    # Open webcam; fail fast instead of spinning forever on failed reads.
    cap = cv2.VideoCapture(0)
    if not cap.isOpened():
        cap.release()
        raise RuntimeError("Could not open webcam (device 0)")

    try:
        # Initialize MediaPipe Holistic
        with mp_holistic.Holistic(
                min_detection_confidence=0.5,
                min_tracking_confidence=0.5) as holistic:

            sample_count = 0
            print(f"Starting data collection for gesture: '{gesture_label}'")
            print("Press 's' to start recording a sample, or 'q' to quit.")

            fourcc = cv2.VideoWriter_fourcc(*'XVID')
            countdown_active = False
            countdown_timer = 0

            while sample_count < num_samples:
                ret, frame = cap.read()
                if not ret:
                    continue

                # Flip horizontally for a more intuitive mirror view
                frame = cv2.flip(frame, 1)

                # Convert to RGB for MediaPipe
                frame_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)

                # Process the frame for visualization only (not recording yet)
                results = holistic.process(frame_rgb)

                # Draw landmarks for visual feedback
                annotated_frame = frame.copy()
                mp_drawing.draw_landmarks(
                    annotated_frame,
                    results.pose_landmarks,
                    mp_holistic.POSE_CONNECTIONS,
                    landmark_drawing_spec=mp_drawing_styles.get_default_pose_landmarks_style())
                mp_drawing.draw_landmarks(
                    annotated_frame,
                    results.left_hand_landmarks,
                    mp_holistic.HAND_CONNECTIONS,
                    landmark_drawing_spec=mp_drawing_styles.get_default_hand_landmarks_style())
                mp_drawing.draw_landmarks(
                    annotated_frame,
                    results.right_hand_landmarks,
                    mp_holistic.HAND_CONNECTIONS,
                    landmark_drawing_spec=mp_drawing_styles.get_default_hand_landmarks_style())

                if countdown_active:
                    # Show countdown timer
                    remaining = int(countdown_timer - time.time())
                    if remaining > 0:
                        cv2.putText(annotated_frame, f"Starting in {remaining}...",
                                    (10, 60), cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 0, 255), 2)
                    else:
                        # Start recording after countdown
                        countdown_active = False
                        print("Recording sample...")
                        sequence = []
                        previous_results_buffer = []

                        # Setup video writer to save the raw video
                        video_filename = os.path.join(video_dir, f"{gesture_label}_{sample_count+1}.avi")
                        height, width, _ = frame.shape
                        out = cv2.VideoWriter(video_filename, fourcc, 20.0, (width, height))

                        recording_start = time.time()

                        try:
                            # Recording loop
                            while len(sequence) < sequence_length:
                                ret, frame = cap.read()
                                if not ret:
                                    continue

                                frame = cv2.flip(frame, 1)
                                out.write(frame)  # Save frame to video file

                                # Process the frame exactly once; a second
                                # call would only repeat the work.
                                frame_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
                                results = holistic.process(frame_rgb)

                                # Always use the full feature extractor so the
                                # first frames have the same length as later
                                # ones (it zero-fills velocities when the
                                # buffer is still empty).
                                features = extract_all_features(results, previous_results_buffer)
                                sequence.append(features)

                                # Update buffer (keep last 5 frames)
                                if len(previous_results_buffer) >= 5:
                                    previous_results_buffer.pop(0)
                                previous_results_buffer.append(results)

                                # Draw landmarks on the frame
                                annotated_frame = frame.copy()
                                mp_drawing.draw_landmarks(annotated_frame, results.pose_landmarks, mp_holistic.POSE_CONNECTIONS)
                                mp_drawing.draw_landmarks(annotated_frame, results.left_hand_landmarks, mp_holistic.HAND_CONNECTIONS)
                                mp_drawing.draw_landmarks(annotated_frame, results.right_hand_landmarks, mp_holistic.HAND_CONNECTIONS)

                                # Show recording progress
                                elapsed = time.time() - recording_start
                                cv2.putText(annotated_frame, f"Recording... {len(sequence)}/{sequence_length}",
                                            (10, 30), cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 0, 255), 2)
                                cv2.putText(annotated_frame, f"Time: {elapsed:.1f}s",
                                            (10, 70), cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 0, 255), 2)

                                cv2.imshow("Data Collection", annotated_frame)
                                if cv2.waitKey(1) & 0xFF == ord('q'):
                                    break
                        finally:
                            # Finish saving the video even if recording failed
                            out.release()

                        # Save the sequence if it's complete
                        if len(sequence) == sequence_length:
                            sequence = np.array(sequence)
                            filename = os.path.join(output_dir, f"{gesture_label}_{sample_count+1}.npy")
                            np.save(filename, sequence)
                            print(f"Saved sample {sample_count+1} as {filename}")
                            print(f"Saved raw video as {video_filename}")
                            sample_count += 1

                            # Show success message with preview time
                            success_start = time.time()
                            while time.time() - success_start < 2:  # 2 second success message
                                cv2.putText(annotated_frame, "Sample recorded successfully!",
                                            (10, 30), cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 255, 0), 2)
                                cv2.imshow("Data Collection", annotated_frame)
                                cv2.waitKey(1)
                        else:
                            print("Incomplete sample, discarding...")
                else:
                    # Normal display mode (not recording)
                    cv2.putText(annotated_frame, f"Gesture: {gesture_label} | Sample {sample_count+1}/{num_samples}",
                                (10, 30), cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 255, 0), 2)
                    cv2.putText(annotated_frame, "Press 's' to start, 'q' to quit",
                                (10, 70), cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 255, 0), 2)

                # Show the frame with annotations
                cv2.imshow("Data Collection", annotated_frame)
                key = cv2.waitKey(1) & 0xFF

                if key == ord('s') and not countdown_active:
                    # Start countdown before recording
                    countdown_active = True
                    countdown_timer = time.time() + 3  # 3 second countdown
                    print("Starting countdown...")
                elif key == ord('q'):
                    break
    finally:
        # Release the camera and close the preview window on every exit path.
        cap.release()
        cv2.destroyAllWindows()

    print("Data collection complete.")

In [16]:
def extract_from_videos(
    video_dir="../data/videos",
    output_dir="../data/raw",
    sequence_length=30,
):
    """
    Extract per-frame features from every ``.avi`` / ``.mp4`` file in
    ``video_dir`` and save them as ``.npy`` sequences in ``output_dir``.

    Each video is read frame by frame; every frame is run through MediaPipe
    Holistic and ``extract_all_features`` (using up to the 5 most recent
    results as history). Every ``sequence_length`` consecutive frames form
    one saved sequence:
      * the first sequence of ``<name>.<ext>`` is written to ``<name>.npy``;
      * further sequences from the same video go to ``<name>_seq2.npy``,
        ``<name>_seq3.npy``, ... so they do not overwrite the first one.
    Trailing frames that do not fill a whole sequence are discarded and
    reported. Videos that cannot be opened are skipped.

    If the per-frame vectors cannot be stacked into a regular array, the
    sequence is saved as an object array with an ``_object`` suffix instead.

    Args:
        video_dir: Directory containing the input videos.
        output_dir: Directory for the ``.npy`` files (created if missing).
        sequence_length: Number of frames per saved sequence.
    """
    os.makedirs(output_dir, exist_ok=True)
    video_files = sorted(f for f in os.listdir(video_dir) if f.endswith((".avi", ".mp4")))

    for video_file in video_files:
        video_path = os.path.join(video_dir, video_file)
        # splitext keeps dots that are part of the name itself.
        stem = os.path.splitext(video_file)[0]
        print(f"Processing {video_file}...")

        cap = cv2.VideoCapture(video_path)
        if not cap.isOpened():
            print(f"Could not open video, skipping: {video_file}")
            cap.release()
            continue

        sequences_saved = 0
        try:
            with mp_holistic.Holistic(
                    min_detection_confidence=0.5,
                    min_tracking_confidence=0.5) as holistic:

                sequence = []
                previous_results_buffer = []

                while True:
                    ret, frame = cap.read()
                    if not ret:
                        break

                    frame_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
                    results = holistic.process(frame_rgb)

                    # Always use extract_all_features for consistency
                    features = extract_all_features(results, previous_results_buffer)
                    sequence.append(features)

                    # Update buffer (keep last 5 results)
                    if len(previous_results_buffer) >= 5:
                        previous_results_buffer.pop(0)
                    previous_results_buffer.append(results)

                    if len(sequence) != sequence_length:
                        continue

                    # Debug: Check all feature dimensions
                    feature_lengths = [len(feat) for feat in sequence]
                    print(f"Feature dimensions: min={min(feature_lengths)}, max={max(feature_lengths)}")

                    # First window keeps the plain name; later windows get a
                    # numbered suffix instead of overwriting it.
                    suffix = "" if sequences_saved == 0 else f"_seq{sequences_saved + 1}"

                    try:
                        sequence_array = np.array(sequence)
                        filename = os.path.join(output_dir, f"{stem}{suffix}.npy")
                        np.save(filename, sequence_array)
                        print(f"Saved sequence to {filename}, shape: {sequence_array.shape}")
                    except ValueError as e:
                        print(f"Error creating array: {e}")
                        # As fallback, save using object array
                        sequence_array = np.array(sequence, dtype=object)
                        filename = os.path.join(output_dir, f"{stem}{suffix}_object.npy")
                        np.save(filename, sequence_array)
                        print(f"Saved as object array to {filename}")

                    sequences_saved += 1
                    sequence = []

                if sequence:
                    print(f"Discarded {len(sequence)} trailing frame(s) of {video_file} "
                          f"(fewer than sequence_length={sequence_length})")
        finally:
            # Release the capture even if processing a frame raised.
            cap.release()

        print(f"Finished processing video: {video_file}")

    print("All videos processed.")

In [14]:
# Data layout: everything lives under BASE_DATA_DIR, with one sub-directory
# each for raw keypoint sequences, augmented sequences and recorded videos.
BASE_DATA_DIR = "../data"
RAW_DATA_DIR, PROCESSED_DATA_DIR, VIDEO_DIR = (
    os.path.join(BASE_DATA_DIR, subdir) for subdir in ("raw", "processed", "videos")
)

In [None]:
# Rebuild the raw keypoint sequences (30 frames each) from the recorded videos.
extract_from_videos(video_dir=VIDEO_DIR, output_dir=RAW_DATA_DIR, sequence_length=30)

In [None]:
# Augment the raw sequences. The directories are passed explicitly so this
# step reads from the same location the extraction step wrote to, instead of
# relying on the function's hard-coded default paths.
augment_data(input_dir=RAW_DATA_DIR, output_dir=PROCESSED_DATA_DIR)