In [2]:
import cv2
import mediapipe as mp
import numpy as np
import os
import random

In [3]:
# Module-level aliases for the MediaPipe Holistic solution and the drawing helpers.
# NOTE(review): none of the visible cells use these aliases - record_gesture still
# builds its detector from mp.solutions.hands - confirm whether the switch to
# Holistic is still planned or whether these can be removed.
mp_holistic = mp.solutions.holistic
mp_drawing = mp.solutions.drawing_utils

In [4]:
def extract_keypoints(frame, hands):
    """
    Run one BGR frame through the given MediaPipe detector and flatten the result.

    The frame is converted to RGB before being handed to `hands.process`.
    Only the first detected hand is used; its landmarks are emitted in order
    as x, y, z triples.

    Returns a 1-D NumPy array (x0, y0, z0, x1, y1, z1, ...) or None when the
    detector reports no hand.
    """
    detection = hands.process(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))

    # Guard clause: nothing detected in this frame.
    if not detection.multi_hand_landmarks:
        return None

    first_hand = detection.multi_hand_landmarks[0]
    flat_coords = [
        value
        for point in first_hand.landmark
        for value in (point.x, point.y, point.z)
    ]
    return np.array(flat_coords)

In [8]:
def augment_jitter(sequence, noise_factor=0.02):
    """
    Return the sequence perturbed by zero-mean Gaussian noise.

    A noise array with the same shape as `sequence` is drawn from NumPy's
    global random generator with standard deviation `noise_factor` and added
    element-wise. The input array is not modified.
    """
    perturbation = np.random.normal(loc=0, scale=noise_factor, size=sequence.shape)
    jittered = sequence + perturbation
    return jittered

In [9]:
def augment_horizontal_flip(sequence):
    """
    Mirror the keypoints left-to-right and return the result as a new array.

    Every third column starting at column 0 is treated as an x-coordinate and
    replaced by 1 - x (x is assumed to lie in [0, 1]). All other columns are
    copied unchanged, and the input array is not modified.
    """
    mirrored = sequence.copy()
    # Columns 0, 3, 6, ... hold the x value of each landmark.
    x_columns = slice(0, sequence.shape[1], 3)
    mirrored[:, x_columns] = 1 - mirrored[:, x_columns]
    return mirrored

In [10]:
def augment_random_rotation(sequence, max_angle=10):
    """
    Rotate every landmark's (x, y) by one random angle around the point (0.5, 0.5).

    - sequence: 2-D array whose columns are laid out as x, y, z triples
      (the layout record_gesture saves); z columns are left untouched.
    - max_angle: the angle is drawn uniformly from [-max_angle, max_angle] degrees
      and the same angle is applied to every frame of the sequence.

    Returns a new array; the input is not modified.
    """
    angle = np.deg2rad(random.uniform(-max_angle, max_angle))
    cos_val = np.cos(angle)
    sin_val = np.sin(angle)

    rotated = sequence.copy()
    # Vectorised over all frames and landmarks at once: strided slices pick the
    # x columns (0, 3, 6, ...) and the y columns (1, 4, 7, ...).
    # Coordinates are shifted so the rotation pivot (0.5, 0.5) - the centre of
    # normalised image coordinates - sits at the origin, rotated, then shifted back.
    x_centered = sequence[:, 0::3] - 0.5
    y_centered = sequence[:, 1::3] - 0.5
    rotated[:, 0::3] = x_centered * cos_val - y_centered * sin_val + 0.5
    rotated[:, 1::3] = x_centered * sin_val + y_centered * cos_val + 0.5
    return rotated

In [11]:
def augment_temporal_warp(sequence, min_stride=1, max_stride=2):
    """
    Subsample the sequence in time and pad it back to its original length.

    A stride is drawn uniformly from the integers [min_stride, max_stride] and
    every stride-th frame is kept. The shortened sequence is then padded by
    repeating its last kept frame so the result has as many frames as the input.
    This simulates temporal speed variations.

    Always returns a new array (never a view of `sequence`), so callers can
    modify the result without affecting the input.
    """
    stride = random.randint(min_stride, max_stride)
    warped = sequence[::stride]

    missing = sequence.shape[0] - warped.shape[0]
    if missing <= 0:
        # Nothing was dropped (stride 1 or empty input): hand back an independent copy.
        return warped[:sequence.shape[0]].copy()

    # Build all padding rows in one go instead of re-stacking the array once per row.
    # warped[-1:] keeps the frame axis, so this works for any number of trailing axes.
    padding = np.repeat(warped[-1:], missing, axis=0)
    return np.concatenate([warped, padding], axis=0)

In [12]:
def augment_sequence(sequence, num_augments=5):
    """
    Produce `num_augments` randomly augmented variants of a keypoint sequence.

    For each variant, the four augmentations (jitter, horizontal flip, random
    rotation, temporal warp) are considered in that order and each one is
    applied independently with probability 1/2. A variant may therefore
    receive several augmentations, or none at all.

    Returns a list of arrays; the input sequence is not modified.
    """
    # (transform, keyword arguments) pairs, in application order.
    pipeline = (
        (augment_jitter, {"noise_factor": 0.02}),
        (augment_horizontal_flip, {}),
        (augment_random_rotation, {"max_angle": 10}),
        (augment_temporal_warp, {"min_stride": 1, "max_stride": 2}),
    )

    variants = []
    for _ in range(num_augments):
        candidate = sequence.copy()
        for transform, options in pipeline:
            # Coin flip per transform decides whether it is applied.
            if random.choice([True, False]):
                candidate = transform(candidate, **options)
        variants.append(candidate)
    return variants

In [13]:
def record_gesture(gesture_label, num_samples=30, sequence_length=30, 
                   output_dir="data/raw", video_dir="videos"):
    """
    Interactively capture keypoint sequences (and raw video) for a given gesture.

    Opens webcam 0 and shows a preview window. Pressing 's' records one sample:
    frames are read until `sequence_length` frames with a detected hand have been
    collected. Pressing 'q' in the preview ends the session; pressing 'q' while a
    sample is being recorded aborts only that sample, which is discarded together
    with its video file.

    - gesture_label: string label for the gesture (e.g., 'hello')
    - num_samples: number of samples to record (default: 30)
    - sequence_length: number of frames per sample sequence
    - output_dir: directory to save the keypoint sequences (.npy, one per sample)
    - video_dir: directory to save the raw video recordings (.avi, one per sample)

    Raises RuntimeError if the webcam cannot be opened.
    The camera, the preview window, the detector and any open video writer are
    released even if an error occurs part-way through.
    """
    os.makedirs(output_dir, exist_ok=True)
    os.makedirs(video_dir, exist_ok=True)

    cap = cv2.VideoCapture(0)
    hands = None
    try:
        if not cap.isOpened():
            # Without this check every cap.read() fails and the preview loop
            # below would spin forever on `continue`.
            raise RuntimeError("Could not open the webcam (device index 0).")

        hands = mp.solutions.hands.Hands(static_image_mode=False,
                                         max_num_hands=1,
                                         min_detection_confidence=0.7)

        sample_count = 0
        print(f"Starting data collection for gesture: '{gesture_label}'")
        print("Press 's' to start recording a sample, or 'q' to quit.")

        fourcc = cv2.VideoWriter_fourcc(*'XVID')

        while sample_count < num_samples:
            ret, frame = cap.read()
            if not ret:
                continue

            # Display instructions on the frame
            cv2.putText(frame, f"Gesture: {gesture_label} | Sample {sample_count+1}/{num_samples}",
                        (10, 30), cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 255, 0), 2)
            cv2.imshow("Data Collection", frame)
            key = cv2.waitKey(1) & 0xFF

            if key == ord('q'):
                break
            if key != ord('s'):
                continue

            print("Recording sample...")
            sequence = []
            # Setup video writer to save the raw video
            video_filename = os.path.join(video_dir, f"{gesture_label}_{sample_count+1}.avi")
            height, width, _ = frame.shape
            out = cv2.VideoWriter(video_filename, fourcc, 20.0, (width, height))
            try:
                while len(sequence) < sequence_length:
                    ret, frame = cap.read()
                    if not ret:
                        continue
                    # Every captured frame goes into the video, including frames
                    # where no hand is detected, so the video can be longer than
                    # the saved keypoint sequence.
                    out.write(frame)
                    keypoints = extract_keypoints(frame, hands)
                    # Only add frames where keypoints are detected
                    if keypoints is not None:
                        sequence.append(keypoints)
                    cv2.putText(frame, f"Recording... {len(sequence)}/{sequence_length}",
                                (10, 30), cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 0, 255), 2)
                    cv2.imshow("Data Collection", frame)
                    if cv2.waitKey(1) & 0xFF == ord('q'):
                        break
            finally:
                out.release()  # Finish saving the video, even on error

            if len(sequence) == sequence_length:
                sequence = np.array(sequence)
                filename = os.path.join(output_dir, f"{gesture_label}_{sample_count+1}.npy")
                np.save(filename, sequence)
                print(f"Saved sample {sample_count+1} as {filename}")
                print(f"Saved raw video as {video_filename}")
                sample_count += 1
            else:
                print("Incomplete sample, discarding...")
                # Keep the video directory in sync with the saved keypoint files.
                if os.path.exists(video_filename):
                    os.remove(video_filename)
    finally:
        cap.release()
        cv2.destroyAllWindows()
        if hands is not None:
            hands.close()

    print("Data collection complete.")

In [15]:
def augment_data(input_dir="data/raw", output_dir="data/processed", num_augments=5):
    """
    Augment every .npy keypoint sequence found in `input_dir`.

    Each file is loaded, passed to augment_sequence to obtain `num_augments`
    variants, and every variant is saved to `output_dir` as
    `<original stem>_aug<k>.npy` with k starting at 1. `output_dir` is created
    if it does not exist. Files are processed in sorted name order.
    """
    os.makedirs(output_dir, exist_ok=True)
    # sorted() makes the processing order independent of the file system.
    for file in sorted(os.listdir(input_dir)):
        if not file.endswith(".npy"):
            continue
        # splitext strips only the final extension, so labels that contain
        # dots (e.g. "thank.you_1.npy") keep their full stem and do not collide.
        stem = os.path.splitext(file)[0]
        sequence = np.load(os.path.join(input_dir, file))
        augmented_sequences = augment_sequence(sequence, num_augments=num_augments)
        for idx, aug_seq in enumerate(augmented_sequences, start=1):
            new_filename = os.path.join(output_dir, f"{stem}_aug{idx}.npy")
            np.save(new_filename, aug_seq)
            print(f"Saved augmented data: {new_filename}")

In [16]:
# Data locations used by this notebook. The base path is relative, so it is
# resolved against the current working directory when the cells run.
# These values are passed explicitly to record_gesture / augment_data, overriding
# those functions' own defaults ("data/raw", "videos", "data/processed").
BASE_DATA_DIR = "../data"
RAW_DATA_DIR = os.path.join(BASE_DATA_DIR, "raw")              # recorded keypoint sequences (.npy)
PROCESSED_DATA_DIR = os.path.join(BASE_DATA_DIR, "processed")  # augmented sequences (.npy)
VIDEO_DIR = os.path.join(BASE_DATA_DIR, "videos")              # raw recordings (.avi)

In [17]:
# Number of samples to record; used for both the message and the recording call
# so the two cannot drift apart.
NUM_SAMPLES = 30

gesture_label = input("Enter gesture label (e.g., hello, bye, thankyou): ").strip()
# The label becomes the file-name prefix of every saved sample, so an empty
# label would produce files named "_1.npy"; keep asking until one is given.
while not gesture_label:
    gesture_label = input("Enter gesture label (e.g., hello, bye, thankyou): ").strip()

print(f"We will record {NUM_SAMPLES} samples for this gesture.")
record_gesture(gesture_label, num_samples=NUM_SAMPLES, output_dir=RAW_DATA_DIR, video_dir=VIDEO_DIR)

# Augmentation is optional and runs over every .npy file in RAW_DATA_DIR,
# not only the gesture recorded above.
augment_choice = input("Do you want to augment the data? (y/n): ").strip().lower()
if augment_choice == 'y':
    augment_data(input_dir=RAW_DATA_DIR, output_dir=PROCESSED_DATA_DIR, num_augments=5)
    print("Data augmentation complete.")

We will record 30 samples for this gesture.


AttributeError: module 'cv2' has no attribute 'VideoCapture'