# Requirements

-- bert

In [None]:
!pip install tensorflow

In [None]:
!pip install pytorch-pretrained-bert pytorch-nlp

-- speech

In [None]:
!pip install git+https://github.com/openai/whisper.git
!sudo apt update && sudo apt install ffmpeg -y

In [None]:
!sudo apt-get install portaudio19-dev # Linux-case
!python -m pip install pyaudio
!pip install SpeechRecognition
!pip install pyttsx3
!pip install --upgrade pyttsx3
!pip uninstall pyttsx3
!pip install pyttsx3==2.90
!sudo apt-get install -y espeak
!sudo apt-get update
!pip install noisereduce soundfile
!pip install jiwer

-- multimodality -> #!pip install torch transformers sentence-transformers scikit-learn pandas opencv-python moviepy mediapipe

# LAB 2

# NLP CLASSIFICATION BERT FINETUNING

In [None]:




# BERT expects every sequence to start with [CLS] and end with [SEP].
# `sentences` (list of str) and `df` are defined outside this cell.
sentences = ["[CLS] " + sentence + " [SEP]" for sentence in sentences]
labels = df.label.values  # one class label per sentence

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)

# WordPiece-tokenize every sentence
tokenized_texts = [tokenizer.tokenize(sent) for sent in sentences]
print ("Tokenize the first sentence:")
print (tokenized_texts[0])

MAX_LEN = 128

# Step 1: map tokens to their index in the BERT vocabulary.
input_ids = [tokenizer.convert_tokens_to_ids(x) for x in tokenized_texts]

# Step 2: pad/truncate the id sequences to a fixed length. This must happen
# AFTER the id conversion; doing it before and then re-assigning input_ids
# throws the padding away and leaves ragged sequences.
# maxlen=MAX_LEN: longer sequences are truncated, shorter ones padded with 0s.
# dtype="long": output array type (64-bit integers).
# truncating="post" / padding="post": cut from / add zeros at the end.
# NOTE: post-truncation also drops the trailing [SEP] of over-long sentences.
input_ids = pad_sequences(input_ids,
                          maxlen=MAX_LEN, dtype="long", truncating="post", padding="post")

# Attention mask: 1.0 for real tokens, 0.0 for padding (padding id is 0).
attention_masks = [[float(i > 0) for i in seq] for seq in input_ids]




In [None]:
# Split ids, labels and attention masks in ONE call so the three arrays are
# guaranteed to share the same train/validation partition (previously the
# masks were split in a second call that relied on repeating the same
# random_state and test_size).
(train_inputs, validation_inputs,
 train_labels, validation_labels,
 train_masks, validation_masks) = train_test_split(input_ids, labels, attention_masks,
                                                   random_state=2018, test_size=0.1)

# Convert all of our data into torch tensors, the required datatype for our model
train_inputs = torch.tensor(train_inputs)
validation_inputs = torch.tensor(validation_inputs)
train_labels = torch.tensor(train_labels)
validation_labels = torch.tensor(validation_labels)
train_masks = torch.tensor(train_masks)
validation_masks = torch.tensor(validation_masks)

# Mini-batching.
# For fine-tuning BERT on a specific task, the authors recommend a batch size of 16 or 32
batch_size = 32

# Training batches are drawn in random order, validation batches sequentially.
train_data = TensorDataset(train_inputs, train_masks, train_labels)
train_sampler = RandomSampler(train_data)
train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=batch_size)

validation_data = TensorDataset(validation_inputs, validation_masks, validation_labels)
validation_sampler = SequentialSampler(validation_data)
validation_dataloader = DataLoader(validation_data, sampler=validation_sampler, batch_size=batch_size)

In [None]:
# Load BertForSequenceClassification, the pretrained BERT model with a single linear classification layer on top.
# num_labels=2 defines how many classes we are predicting.
model = BertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=2)

# Pick the GPU when one is available and fall back to CPU otherwise
# (model.cuda() raises on machines without CUDA). `device` is also the name
# the training/validation loops use when moving batches.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

In [None]:
# Get all model parameters with their names (e.g., 'bert.encoder.layer.0.attention.self.query.weight')
param_optimizer = list(model.named_parameters())

# Parameters that should NOT be decayed (weight decay = L2-style regularization):
# biases and LayerNorm scale/shift. 'LayerNorm.weight' covers the current
# parameter naming; 'gamma'/'beta' are kept for older naming.
no_decay = ['bias', 'LayerNorm.weight', 'gamma', 'beta']

# Two parameter groups for the optimizer. The per-group key must be
# 'weight_decay' (the name BertAdam reads); an unrecognised key such as
# 'weight_decay_rate' is ignored and the optimizer default is applied to
# every group instead.
optimizer_grouped_parameters = [
    # Group 1: parameters that receive weight decay
    {
        'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
        'weight_decay': 0.01
    },
    # Group 2: parameters excluded from weight decay
    {
        'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
        'weight_decay': 0.0
    }
]


# This variable contains all of the hyperparameter information our training loop needs.
# NOTE(review): BertAdam appears to apply the warmup schedule only when
# t_total (total number of optimizer steps) is also supplied - confirm and
# pass t_total if warmup is intended.
optimizer = BertAdam(optimizer_grouped_parameters,
                     lr=2e-5,
                     warmup=.1)


# Function to calculate the accuracy of predictions against true labels
def flat_accuracy(preds, labels):
    """Return the fraction of samples whose arg-max prediction equals its label.

    preds:  2-D array of per-class scores, one row per sample.
    labels: array of true class indices; flattened before comparison.
    """
    # Predicted class = column index of the highest score in each row.
    predicted = np.argmax(preds, axis=1).flatten()
    expected = labels.flatten()

    # Number of matches divided by the number of samples.
    return np.sum(predicted == expected) / len(expected)


In [None]:
# Per-step training loss, kept for plotting
train_loss_set = []

# Number of training epochs (authors recommend between 2 and 4)
epochs = 4

# trange is a tqdm wrapper around the normal python range
for _ in trange(epochs, desc="Epoch"):

  # ---------------- Training ----------------

  # Set our model to training mode (as opposed to evaluation mode)
  model.train()

  # Running totals for the epoch's mean loss
  tr_loss = 0
  nb_tr_steps = 0

  # Train the data for one epoch
  for batch in train_dataloader:
    # Move the batch to the compute device
    batch = tuple(t.to(device) for t in batch)
    # Unpack the inputs from our dataloader
    b_input_ids, b_input_mask, b_labels = batch
    # Clear out the gradients (by default they accumulate)
    optimizer.zero_grad()
    # Forward pass: with labels supplied the model returns the loss
    loss = model(b_input_ids, token_type_ids=None, attention_mask=b_input_mask, labels=b_labels)
    # Read the scalar once and reuse it for both the log and the running total
    step_loss = loss.item()
    train_loss_set.append(step_loss)
    # Backward pass
    loss.backward()
    # Update parameters and take a step using the computed gradient
    optimizer.step()

    # Update tracking variables
    tr_loss += step_loss
    nb_tr_steps += 1

  print("Train loss: {}".format(tr_loss/nb_tr_steps))


  # ---------------- Validation ----------------

  # Put model in evaluation mode to evaluate on the validation set
  model.eval()

  # Running totals for the mean per-batch accuracy
  eval_accuracy = 0
  nb_eval_steps = 0

  # Evaluate data for one epoch
  for batch in validation_dataloader:
    # Move the batch to the compute device
    batch = tuple(t.to(device) for t in batch)
    # Unpack the inputs from our dataloader
    b_input_ids, b_input_mask, b_labels = batch
    # No gradients are needed for validation; this saves memory and time
    with torch.no_grad():
      # Forward pass without labels returns the logits
      logits = model(b_input_ids, token_type_ids=None, attention_mask=b_input_mask)

    # Move logits and labels to CPU
    logits = logits.detach().cpu().numpy()
    label_ids = b_labels.to('cpu').numpy()

    eval_accuracy += flat_accuracy(logits, label_ids)
    nb_eval_steps += 1

  # Mean of the per-batch accuracies
  print("Validation Accuracy: {}".format(eval_accuracy/nb_eval_steps))

The test-set evaluation follows the same pattern as the validation loop above; see the Lab 2 notebook (.ipynb) for the full code.

In [None]:
# Import the metric
from sklearn.metrics import matthews_corrcoef

# Matthews correlation coefficient (MCC) per batch.
# `predictions` and `true_labels` are defined outside this cell; they are
# iterated in lockstep, one entry per batch. For each batch the predicted
# class is the arg-max over the class axis of the model scores.
# MCC accounts for true/false positives/negatives and works well for
# imbalanced datasets.
matthews_set = [
    matthews_corrcoef(batch_labels, np.argmax(batch_scores, axis=1).flatten())
    for batch_labels, batch_scores in zip(true_labels, predictions)
]


# LAB 3  SPEECH TRANSLATION

# SPEECH TO TEXT

Built-in Whisper model (best accuracy)

In [None]:
# Speech-to-text with OpenAI Whisper: install, load a model, upload one audio
# file through the Colab file picker, transcribe it and print the text.

# Install Whisper + dependencies (ffmpeg is installed as a system package)
!pip install git+https://github.com/openai/whisper.git
!sudo apt update && sudo apt install ffmpeg -y

# Import libraries
import whisper

# Load pre-trained Whisper model
# Options: tiny, base, small, medium, large
# NOTE: this rebinds the name `model`, which the BERT cells above also use.
model = whisper.load_model("small")

# Upload an audio file (wav, mp3, m4a etc.)
# google.colab is only importable inside Google Colab.
from google.colab import files
uploaded = files.upload()

# Take first uploaded file (raises IndexError if nothing was uploaded)
audio_file = list(uploaded.keys())[0]

# Transcribe the audio
result = model.transcribe(audio_file)   # optionally: model.transcribe(audio_file, language="en")

# Print the transcription
print("Transcription:\n", result["text"])

Noise Reduction

-- slides

In [None]:
 # Minimal noise-reduction example: read a WAV file, use its first second as
 # the noise profile, denoise, and write the result to a new WAV file.
 import noisereduce as nr
 from scipy.io import wavfile

 # 1. Load the noisy audio file and its sample rate
 rate, data = wavfile.read("my_noisy_audio.wav")

 # 2. Select a sample of pure noise
 # (the first `rate` samples, i.e. the first 1 second). This part is crucial!
 # NOTE(review): assumes the recording really starts with noise only, and that
 # the file is mono - the assignment version below downmixes stereo first.
 noise_clip = data[0:rate]

 # 3. Perform noise reduction using the explicit noise profile
 reduced_noise_data = nr.reduce_noise(y=data, sr=rate, y_noise=noise_clip)

 # 4. Save the clean audio to a new file (same sample rate as the input)
 wavfile.write("my_clean_audio.wav", rate, reduced_noise_data)
 print("Noise reduction complete!")

-- assignment

In [None]:
import noisereduce as nr
from scipy.io import wavfile
import numpy as np

# Step 1: Load noisy audio (sample rate in Hz, samples as a numpy array)
rate, data = wavfile.read("my_noisy_audio.wav")

# Step 2: Convert stereo to mono if needed by averaging the channels
if len(data.shape) > 1:
    data = np.mean(data, axis=1)

# Step 3: Peak-normalize to float32 in range [-1, 1]
# Avoids integer overflow during processing
data = data.astype(np.float32)
data = data / (np.max(np.abs(data)) + 1e-10)  # small epsilon to prevent division by 0

# Step 4: Extract a noise segment (first `rate` samples = first 1 second)
# NOTE(review): assumes the recording starts with noise only.
noise_clip = data[0:rate]

# Step 5: Reduce noise
reduced_noise_data = nr.reduce_noise(
    y=data,
    sr=rate,
    y_noise=noise_clip,  # provide explicit noise profile
    prop_decrease=1.0,   # adjust strength of noise reduction
)

# Step 6: Convert back to int16 for WAV file saving.
# Clip to [-1, 1] first: any sample outside that range would otherwise wrap
# around when cast to int16 and produce loud clicks.
reduced_noise_data = np.clip(reduced_noise_data, -1.0, 1.0)
reduced_noise_data = np.int16(reduced_noise_data * 32767)

# Step 7: Save cleaned audio
wavfile.write("my_cleaned_audio.wav", rate, reduced_noise_data)

print("Noise reduction complete!")


NORMALISATION

In [None]:
# Peak normalization: scale the signal so its largest absolute sample is ~1.
import librosa
import numpy as np

# Load audio, resampled to 16 kHz (y: samples, sr: sample rate)
y, sr = librosa.load("my_noisy_audio.wav", sr=16000)

# Before normalization: max amplitude
print("Before normalization:", np.max(np.abs(y)))

# Normalize to [-1, 1]; the small epsilon prevents division by zero on silence
y_norm = y / (np.max(np.abs(y)) + 1e-10)

# After normalization the peak should be (almost exactly) 1.0
print("After normalization:", np.max(np.abs(y_norm)))


Framing (25ms frames with 10ms hop)

In [None]:
# Framing: cut the signal into short overlapping windows.
import librosa
import numpy as np

# Load audio at 16 kHz and peak-normalize it
y, sr = librosa.load("my_noisy_audio.wav", sr=16000)
y = y / (np.max(np.abs(y)) + 1e-10)

# Frame parameters (in samples)
frame_length = int(0.025 * sr)  # 25 ms window
hop_length = int(0.010 * sr)    # 10 ms hop, so consecutive frames overlap by 15 ms

# Use librosa's framing; transpose so that each row is one frame
frames = librosa.util.frame(y, frame_length=frame_length, hop_length=hop_length).T
print("Shape of frames:", frames.shape)  # (#frames, frame_length)


Visualization (Waveform + Spectrogram)

In [None]:
import librosa
import librosa.display
import matplotlib.pyplot as plt
import numpy as np  # needed for np.abs / np.max below; keeps the cell self-contained

# Load audio, resampled to 16 kHz
y, sr = librosa.load("my_noisy_audio.wav", sr=16000)

# Plot waveform (amplitude over time)
plt.figure(figsize=(12, 4))
librosa.display.waveshow(y, sr=sr)
plt.title("Audio Waveform")
plt.show()

# Plot spectrogram
D = librosa.stft(y)  # Short-time Fourier transform (complex values)
# Magnitude in dB relative to the loudest bin
S_db = librosa.amplitude_to_db(np.abs(D), ref=np.max)

plt.figure(figsize=(12, 4))
librosa.display.specshow(S_db, sr=sr, x_axis='time', y_axis='hz')
plt.colorbar(format="%+2.0f dB")
plt.title("Spectrogram")
plt.show()


-- MANUAL

In [None]:
# Import libraries
import speech_recognition as sr  # For speech-to-text conversion
import pyttsx3                  # For text-to-speech conversion

# Step 1: Initialize the recognizer that will process the audio
r = sr.Recognizer()

# Step 2: Define a function to convert text to speech
def SpeakText(command):
    """
    Speak the given text aloud.

    A fresh pyttsx3 engine is created for every call; the call blocks until
    the text has been spoken.
    """
    engine = pyttsx3.init()  # Initialize the TTS engine
    engine.say(command)       # Queue the text to speak
    engine.runAndWait()       # Speak the text and wait until finished

# Step 3: Specify the path to your audio file
audio_file = "/content/my_cleaned_audio.wav"  # Replace with your cleaned audio file

# Step 4: Read the whole audio file into memory; the file is closed again
# before the (slow) network request below.
with sr.AudioFile(audio_file) as source:
    audio_data = r.record(source)

# Step 5: Convert speech to text using Google Speech Recognition.
# Only the recognition call sits inside the try, so the handlers below can
# only ever report recognition failures.
try:
    MyText = r.recognize_google(audio_data)
except sr.RequestError as e:
    # API was unreachable or unresponsive
    print("Could not request results; {0}".format(e))
except sr.UnknownValueError:
    # Speech was unintelligible
    print("Could not understand the audio")
else:
    MyText_Cleaned = MyText.lower()  # Lowercase for consistency
    print("Did you say in after cleaning noisy audio:", MyText_Cleaned)

    # Step 6: Speak the recognized text aloud
    SpeakText(MyText_Cleaned)


-- ACCURACY

-- LAB3 assignment wer builtin

# WER IMPLEMENTATION

In [None]:
def wer(reference, hypothesis):
    """
    Calculate Word Error Rate (WER) between reference and hypothesis text.

    WER = (S + D + I) / N
    S = substitutions, D = deletions, I = insertions, N = number of words in reference

    Words are obtained by whitespace splitting and compared exactly
    (case-sensitive, punctuation included).

    Args:
        reference (str): Correct text (ground truth)
        hypothesis (str): Transcribed text

    Returns:
        float: WER as a fraction. 0.0 means a perfect match; the value can
        exceed 1.0 when the hypothesis contains many insertions. For an empty
        reference the result is 0.0 if the hypothesis is also empty and
        float("inf") otherwise (every hypothesis word is an error, but there
        is no reference length to normalize by).
    """
    ref_words = reference.strip().split()
    hyp_words = hypothesis.strip().split()
    n = len(ref_words)
    m = len(hyp_words)

    # Empty reference: WER is undefined; never report a spurious "perfect" 0.0
    # for a non-empty hypothesis.
    if n == 0:
        return 0.0 if m == 0 else float("inf")

    # Word-level Levenshtein distance, keeping only two rows of the DP table.
    # prev[j] = edit distance between the first i-1 reference words and the
    # first j hypothesis words; row 0 is "j insertions".
    prev = list(range(m + 1))
    for i, ref_word in enumerate(ref_words, start=1):
        curr = [i] + [0] * m  # column 0: i deletions
        for j, hyp_word in enumerate(hyp_words, start=1):
            if ref_word == hyp_word:
                curr[j] = prev[j - 1]  # no error if words match
            else:
                curr[j] = 1 + min(
                    prev[j - 1],  # substitution
                    curr[j - 1],  # insertion
                    prev[j],      # deletion
                )
        prev = curr

    return prev[m] / n


# -----------------------------
# Example usage
reference_text = "hello world this is a test"
hypothesis_text_noisy = "hello word this is test"
hypothesis_text_cleaned = "hello world this is a test"

# Score each transcript against the reference and report it as a percentage.
noisy_pct = wer(reference_text, hypothesis_text_noisy) * 100
cleaned_pct = wer(reference_text, hypothesis_text_cleaned) * 100
print(f"WER for noisy text:   {noisy_pct:.2f}%")
print(f"WER for cleaned text: {cleaned_pct:.2f}%")


# TEXT TO SPEECH

In [None]:
# Import the text-to-speech library
# Text-to-speech demo: configure a pyttsx3 engine and speak one sentence.
import pyttsx3

# Step 1: Initialize the TTS engine
engine = pyttsx3.init()  # creates a TTS engine object

# Step 2: Set properties (optional)
engine.setProperty('rate', 150)     # Speech rate (words per minute)
engine.setProperty('volume', 1.0)   # Volume (0.0 to 1.0)
voices = engine.getProperty('voices')
# Choose a voice (0 = first voice, 1 = second, etc.).
# Raises IndexError if the engine reports no voices.
engine.setProperty('voice', voices[0].id)

# Step 3: Input text to speak
text_to_speak = "Hello! This is your text-to-speech program. You can type any text here."

# Step 4: Queue the text
engine.say(text_to_speak)

# Step 5: Run the speech engine; blocks until the queued text has been spoken
engine.runAndWait()

print("Text has been spoken successfully!")


# LAB 4

-- Opencv image preprocessing

-- Object detection

-- image segmentation

-- yolo live video py file

# LAB 5

-- multimodality assignment

-- gestures

In [None]:
"""
gestures.py

Heuristic gesture detectors for MediaPipe-like 21 hand landmarks.

Each function expects `landmarks` to be a sequence of 21 points:
    landmarks[i] == (x, y)  OR (x, y, z)
Coordinates should be in the same units (e.g., normalized image coords or pixels).
Indices follow MediaPipe Hands convention:
    0: wrist
    Thumb: 1..4 (tip = 4)
    Index: 5..8 (tip = 8)
    Middle: 9..12 (tip = 12)
    Ring: 13..16 (tip = 16)
    Pinky: 17..20 (tip = 20)

Heuristics adapted from MediaPipe docs and community examples.
References: MediaPipe Hands & community code.
"""

from typing import Sequence, Tuple
import math

Point = Tuple[float, float]  # (x, y) or (x, y, z) - code uses only x,y

# landmark index constants (MediaPipe)
WRIST = 0
THUMB_TIP = 4
THUMB_IP = 3
THUMB_MCP = 2
INDEX_PIP = 6
INDEX_TIP = 8
MIDDLE_PIP = 10
MIDDLE_TIP = 12
RING_PIP = 14
RING_TIP = 16
PINKY_PIP = 18
PINKY_TIP = 20

TIP_INDICES = [THUMB_TIP, INDEX_TIP, MIDDLE_TIP, RING_TIP, PINKY_TIP]
PIP_INDICES = [THUMB_IP, INDEX_PIP, MIDDLE_PIP, RING_PIP, PINKY_PIP]


def _xy(pt):
    """Return (x,y) from a point that may have 2 or 3 elements."""
    return (pt[0], pt[1])


def _dist(a: Point, b: Point) -> float:
    """Return the Euclidean distance between *a* and *b* in the x/y plane.

    Only the first two components of each point are used.
    """
    (ax, ay), (bx, by) = _xy(a), _xy(b)
    return math.hypot(ax - bx, ay - by)


def hand_size(landmarks: Sequence[Tuple[float, float]]) -> float:
    """
    Return a scale for the hand: the largest wrist-to-fingertip distance.

    Other detectors multiply their ratio thresholds by this value so that
    they do not depend on how large the hand appears.
    """
    wrist_pt = landmarks[WRIST]
    tip_distances = [_dist(wrist_pt, landmarks[idx]) for idx in TIP_INDICES]
    if not tip_distances:
        # Only reachable if TIP_INDICES is empty.
        return 1.0
    return max(tip_distances)


def is_finger_extended(landmarks: Sequence[Tuple[float, float]],
                       tip_idx: int, pip_idx: int,
                       wrist_idx: int = WRIST,
                       margin: float = 0.0) -> bool:
    """
    Return True when the fingertip lies farther from the wrist than the
    finger's reference joint (pip) does, by more than `margin`.

    `margin` is an absolute distance in the same units as the landmarks; it
    keeps borderline poses from counting as extended. Because only distances
    to the wrist are compared, the result does not depend on hand rotation.
    """
    wrist_pt = landmarks[wrist_idx]
    tip_reach = _dist(landmarks[tip_idx], wrist_pt)
    joint_reach = _dist(landmarks[pip_idx], wrist_pt)
    return tip_reach > (joint_reach + margin)


def fingers_extended_list(landmarks: Sequence[Tuple[float, float]],
                          margin_ratio: float = 0.08) -> list:
    """
    Return [thumb, index, middle, ring, pinky] booleans telling whether each
    finger appears extended.

    The absolute margin passed to is_finger_extended is
    hand_size(landmarks) * margin_ratio.
    """
    margin = hand_size(landmarks) * margin_ratio
    # TIP_INDICES / PIP_INDICES are ordered thumb..pinky; the thumb entry
    # pairs its tip with the IP joint, the other fingers with their PIP joint.
    return [
        is_finger_extended(landmarks, tip, joint, margin=margin)
        for tip, joint in zip(TIP_INDICES, PIP_INDICES)
    ]


def is_palm_open(landmarks: Sequence[Tuple[float, float]],
                 min_extended: int = 4,
                 spread_ratio: float = 0.25) -> bool:
    """
    Open palm: enough fingers extended AND the fingertips are spread apart.

    - min_extended: minimum number of extended fingers (default 4, so one
      finger, e.g. the thumb, may be folded)
    - spread_ratio: the mean fingertip-to-fingertip distance must exceed
      hand_size * spread_ratio
    """
    flags = fingers_extended_list(landmarks)
    if sum(1 for flag in flags if flag) < min_extended:
        return False

    # Mean distance over every unordered pair of fingertips.
    tips = [landmarks[idx] for idx in TIP_INDICES]
    total = 0.0
    pairs = 0
    for first, tip_a in enumerate(tips):
        for tip_b in tips[first + 1:]:
            total += _dist(tip_a, tip_b)
            pairs += 1
    mean_gap = (total / pairs) if pairs else 0.0

    return mean_gap > (hand_size(landmarks) * spread_ratio)


def is_fist(landmarks: Sequence[Tuple[float, float]],
            max_extended: int = 0,
            near_wrist_ratio: float = 0.55) -> bool:
    """
    Fist: at most `max_extended` fingers extended AND the fingertips sit
    close to the wrist.

    - max_extended: maximum allowed number of extended fingers (default 0)
    - near_wrist_ratio: the mean tip-to-wrist distance must be below
      hand_size * near_wrist_ratio (hand_size is the largest tip-to-wrist
      distance)
    """
    flags = fingers_extended_list(landmarks)
    extended_count = sum(1 for flag in flags if flag)
    if extended_count > max_extended:
        return False

    wrist_pt = landmarks[WRIST]
    mean_reach = sum(_dist(wrist_pt, landmarks[idx]) for idx in TIP_INDICES) / len(TIP_INDICES)
    limit = hand_size(landmarks) * near_wrist_ratio
    return mean_reach < limit


def is_thumb_up(landmarks: Sequence[Tuple[float, float]],
                other_finger_max_extended: int = 0) -> bool:
    """
    Thumb-up: the thumb is extended while at most
    `other_finger_max_extended` of the remaining four fingers are.

    The check is purely relative (thumb vs. other fingers); it does not look
    at which way the thumb points in the image.
    """
    thumb_flag, *other_flags = fingers_extended_list(landmarks)
    others_count = sum(1 for flag in other_flags if flag)
    return thumb_flag and (others_count <= other_finger_max_extended)


def is_ok_sign(landmarks: Sequence[Tuple[float, float]],
               close_ratio: float = 0.12,
               other_extended_min: int = 2) -> bool:
    """
    OK sign: thumb tip and index tip touch (pinch circle) while enough of
    the middle/ring/pinky fingers are extended.

    - close_ratio: thumb-tip to index-tip distance must be below
      hand_size * close_ratio
    - other_extended_min: minimum number of extended fingers among
      middle, ring and pinky
    """
    pinch_limit = hand_size(landmarks) * close_ratio
    pinch_gap = _dist(landmarks[THUMB_TIP], landmarks[INDEX_TIP])
    remaining = fingers_extended_list(landmarks)[2:]  # middle, ring, pinky
    if not (pinch_gap < pinch_limit):
        return False
    return sum(1 for flag in remaining if flag) >= other_extended_min


def is_peace_sign(landmarks: Sequence[Tuple[float, float]],
                  required_extended: Sequence[bool] = (False, True, True, False, False),
                  separation_ratio: float = 0.12) -> bool:
    """
    Peace (V) sign: index & middle extended and apart; ring & pinky folded.

    - required_extended: booleans for (thumb, index, middle, ring, pinky).
      Fingers marked True must be extended. Ring and pinky, when marked
      False, must be folded; a False thumb entry is not enforced because the
      thumb is often folded or held aside.
    - separation_ratio: the index-tip to middle-tip distance must exceed
      hand_size * separation_ratio (default 0.12, the previously hard-coded
      value).

    Index and middle must be extended regardless of `required_extended`.
    """
    extended = fingers_extended_list(landmarks)
    for i, req in enumerate(required_extended):
        if req and not extended[i]:
            return False
        if (not req) and extended[i] and i in (3, 4):  # ring/pinky should be folded
            return False

    if not (extended[1] and extended[2]):
        return False

    # The two raised fingers must form a visible "V", not lie side by side.
    tip_gap = _dist(landmarks[INDEX_TIP], landmarks[MIDDLE_TIP])
    return tip_gap > (hand_size(landmarks) * separation_ratio)


def is_pointing(landmarks: Sequence[Tuple[float, float]],
                require_index_extended: bool = True) -> bool:
    """
    Pointing: middle, ring and pinky folded; the index finger must be
    extended unless `require_index_extended` is False. The thumb is ignored.
    """
    flags = fingers_extended_list(landmarks)
    if require_index_extended and not flags[1]:
        return False
    # None of middle (2), ring (3), pinky (4) may be extended.
    return not any(flags[2:5])


def is_pinch(landmarks: Sequence[Tuple[float, float]],
             close_ratio: float = 0.1) -> bool:
    """
    Pinch: the thumb tip is closer to the index tip than
    hand_size * close_ratio.
    """
    threshold = hand_size(landmarks) * close_ratio
    tip_gap = _dist(landmarks[THUMB_TIP], landmarks[INDEX_TIP])
    return tip_gap < threshold


# Convenience: detect a set of common gestures
def detect_all(landmarks: Sequence[Tuple[float, float]]) -> dict:
    """
    Run every hand-gesture detector with its default parameters.

    Returns a dict with one boolean per gesture ('palm_open', 'fist',
    'thumb_up', 'ok', 'peace', 'pointing', 'pinch') plus 'extended_flags',
    the per-finger list from fingers_extended_list. Several gestures can be
    True at the same time; combining them is left to the caller.
    """
    detectors = (
        ('palm_open', is_palm_open),
        ('fist', is_fist),
        ('thumb_up', is_thumb_up),
        ('ok', is_ok_sign),
        ('peace', is_peace_sign),
        ('pointing', is_pointing),
        ('pinch', is_pinch),
    )
    report = {label: check(landmarks) for label, check in detectors}
    report['extended_flags'] = fingers_extended_list(landmarks)
    return report

# If used as script, show a tiny demo of expected input format
# Script entry point: only prints usage hints; no detection is run here.
if __name__ == "__main__":
    print("gestures.py loaded - functions ready.")
    print("Example: call detect_all(landmarks) where landmarks is a list of 21 (x,y) points.")


How to use

Run MediaPipe Hands (or another hand-landmark detector) to obtain the 21 landmarks per hand; MediaPipe returns normalized x, y coordinates. See the MediaPipe documentation (Google AI for Developers) for how to extract the landmarks.

Pass the landmarks list to detect_all(landmarks) or any of the predicate functions above. Example:




In [None]:
# pseudo-code - illustrative only: x0, y0, ... are placeholders and the list
# below is not runnable as written.
landmarks = [(x0,y0), (x1,y1), ...]   # 21 items from MediaPipe
results = detect_all(landmarks)       # dict of gesture name -> bool
if results['palm_open']:
    print("Open palm detected")

In [None]:
"""
pose_gestures.py

Rule-based pose detectors using MediaPipe Pose landmarks.
Requires mediapipe and opencv.

Landmark indices (MediaPipe Pose, 33 points):
    0: nose
    11: left_shoulder
    12: right_shoulder
    13: left_elbow
    14: right_elbow
    15: left_wrist
    16: right_wrist
    23: left_hip
    24: right_hip
    25: left_knee
    26: right_knee
    27: left_ankle
    28: right_ankle
"""

from typing import Sequence, Tuple
import math

Point = Tuple[float, float]

# landmark indices
NOSE = 0
LEFT_SHOULDER, RIGHT_SHOULDER = 11, 12
LEFT_ELBOW, RIGHT_ELBOW = 13, 14
LEFT_WRIST, RIGHT_WRIST = 15, 16
LEFT_HIP, RIGHT_HIP = 23, 24
LEFT_KNEE, RIGHT_KNEE = 25, 26
LEFT_ANKLE, RIGHT_ANKLE = 27, 28


def _dist(a: Point, b: Point) -> float:
    ax, ay = a; bx, by = b
    return math.hypot(ax - bx, ay - by)


def body_size(landmarks: Sequence[Point]) -> float:
    """
    Return the shoulder width (left-to-right shoulder distance), used as the
    reference scale for the body.
    """
    return _dist(landmarks[LEFT_SHOULDER], landmarks[RIGHT_SHOULDER])


def is_hands_up(landmarks: Sequence[Point]) -> bool:
    """
    True if each wrist has a smaller y value than the shoulder on the same
    side (i.e. is above it when y grows downward, as in image coordinates).
    """
    left_raised = landmarks[LEFT_WRIST][1] < landmarks[LEFT_SHOULDER][1]
    right_raised = landmarks[RIGHT_WRIST][1] < landmarks[RIGHT_SHOULDER][1]
    return left_raised and right_raised


def is_t_pose(landmarks: Sequence[Point], tolerance: float = 0.15) -> bool:
    """
    Arms extended sideways (T-pose).

    For each side, the wrist must be within `tolerance` of the shoulder's y
    value and horizontally more than 0.8 * shoulder width away from the
    shoulder. `tolerance` is an absolute value in landmark units, whereas the
    horizontal threshold scales with body_size.
    """
    min_reach = body_size(landmarks) * 0.8
    sides = ((LEFT_SHOULDER, LEFT_WRIST), (RIGHT_SHOULDER, RIGHT_WRIST))
    for shoulder_idx, wrist_idx in sides:
        shoulder, wrist = landmarks[shoulder_idx], landmarks[wrist_idx]
        # Wrist roughly at shoulder height
        if not (abs(wrist[1] - shoulder[1]) < tolerance):
            return False
        # Wrist far out to the side of the shoulder
        if not (abs(wrist[0] - shoulder[0]) > min_reach):
            return False
    return True


def is_squat(landmarks: Sequence[Point], knee_ratio: float = 0.8) -> bool:
    """
    Squat: the mean hip y exceeds `knee_ratio` times the mean knee y.

    With y growing downward this means the hips have dropped towards knee
    height. The comparison uses absolute y values, so the result depends on
    where the person is placed vertically in the frame.
    """
    hip_level = (landmarks[LEFT_HIP][1] + landmarks[RIGHT_HIP][1]) / 2
    knee_level = (landmarks[LEFT_KNEE][1] + landmarks[RIGHT_KNEE][1]) / 2
    threshold = knee_level * knee_ratio
    return hip_level > threshold


def is_standing(landmarks: Sequence[Point], straight_ratio: float = 0.3) -> bool:
    """
    Standing straight: on both sides hip y < knee y < ankle y, i.e. hips
    above knees and knees above ankles when y grows downward.

    `straight_ratio` is accepted but not used by the check.
    """
    legs = (
        (LEFT_HIP, LEFT_KNEE, LEFT_ANKLE),
        (RIGHT_HIP, RIGHT_KNEE, RIGHT_ANKLE),
    )
    for hip_idx, knee_idx, ankle_idx in legs:
        hip_y = landmarks[hip_idx][1]
        knee_y = landmarks[knee_idx][1]
        ankle_y = landmarks[ankle_idx][1]
        if not (hip_y < knee_y < ankle_y):
            return False
    return True


def detect_all(landmarks: Sequence[Point]) -> dict:
    """
    Run every pose detector with its default parameters and return a dict
    with the boolean results under 'hands_up', 't_pose', 'squat' and
    'standing'.

    Input landmarks should be normalized (0-1) from MediaPipe.
    """
    checks = (
        ("hands_up", is_hands_up),
        ("t_pose", is_t_pose),
        ("squat", is_squat),
        ("standing", is_standing),
    )
    return {label: check(landmarks) for label, check in checks}
