In [5]:
# Install dependencies. 
# We need specific versions to ensure stability.
!pip install mediapipe==0.10.31 opencv-python pycaw comtypes numpy




[notice] A new release of pip available: 22.2.2 -> 25.3
[notice] To update, run: python.exe -m pip install --upgrade pip


In [6]:
import urllib.request
import os

# Fetch the Hand Landmarker model bundle (float16, version 1) once and cache it
# next to the notebook.
url = "https://storage.googleapis.com/mediapipe-models/hand_landmarker/hand_landmarker/float16/1/hand_landmarker.task"
model_path = "hand_landmarker.task"

if not os.path.exists(model_path):
    # Download under a temporary name and rename only after the transfer has
    # completed, so an interrupted download never leaves a truncated file that
    # a later run would mistake for a valid model.
    partial_path = model_path + ".part"
    try:
        urllib.request.urlretrieve(url, partial_path)
        os.replace(partial_path, model_path)
    finally:
        # After a successful rename this is a no-op; after a failure it removes
        # whatever was written so far.
        if os.path.exists(partial_path):
            os.remove(partial_path)
    print("Model downloaded successfully.")
else:
    print("Model already exists.")

Model already exists.


In [10]:
import cv2
import mediapipe as mp
import numpy as np
import time
from ctypes import cast, POINTER
from comtypes import CLSCTX_ALL
# We import the raw interface definitions
from pycaw.pycaw import AudioUtilities, IAudioEndpointVolume
from comtypes import CoCreateInstance, GUID, CLSCTX_ALL
from ctypes import cast, POINTER
from pycaw.pycaw import IAudioEndpointVolume, IMMDeviceEnumerator

In [11]:


# Windows Core Audio constants, defined by hand for the raw COM calls below.

# Enum values passed to GetDefaultAudioEndpoint.
EDataFlow_eRender = 0   # data-flow direction: rendering (audio output)
ERole_eMultimedia = 1   # device role: multimedia playback

# Class ID used to instantiate the MMDeviceEnumerator COM object.
CLSID_MMDeviceEnumerator = GUID('{BCDE0395-E52F-467C-8E3D-C4579291692E}')

In [12]:
# Step 1: instantiate the device enumerator COM object from its class ID.
device_enumerator = CoCreateInstance(CLSID_MMDeviceEnumerator, IMMDeviceEnumerator, CLSCTX_ALL)

# Step 2: request the default render endpoint for the multimedia role directly
# from the enumerator, rather than going through a "GetSpeakers()" helper.
speakers = device_enumerator.GetDefaultAudioEndpoint(
    EDataFlow_eRender,
    ERole_eMultimedia,
)

# Step 3: activate the endpoint-volume interface on the raw device object.
interface = speakers.Activate(
    IAudioEndpointVolume._iid_,
    CLSCTX_ALL,
    None,
)

In [13]:
# Turn the activated interface into a typed IAudioEndpointVolume pointer.
volume = cast(interface, POINTER(IAudioEndpointVolume))

# Query the supported volume range; the first two entries are used as the
# lower and upper bounds (reported in dB by the message below).
vol_range = volume.GetVolumeRange()
min_vol, max_vol = vol_range[0], vol_range[1]

print(f"Success. Volume Range: {min_vol} dB to {max_vol} dB")

Success. Volume Range: -65.25 dB to 0.0 dB


In [14]:
# Short aliases for the MediaPipe Tasks classes used in this notebook.
BaseOptions = mp.tasks.BaseOptions
HandLandmarker = mp.tasks.vision.HandLandmarker
HandLandmarkerOptions = mp.tasks.vision.HandLandmarkerOptions
VisionRunningMode = mp.tasks.vision.RunningMode

# Configure the landmarker for VIDEO mode: frames are submitted one at a time
# together with a timestamp (see the detect_for_video call in the capture loop).
options = HandLandmarkerOptions(
    # Reuse the path from the download cell so the two cannot drift apart.
    base_options=BaseOptions(model_asset_path=model_path),
    running_mode=VisionRunningMode.VIDEO,
    num_hands=1,  # only the first detected hand is used to control the volume
    min_hand_detection_confidence=0.5
)

In [15]:
def map_range(value, in_min, in_max, out_min, out_max, clamp=False):
    """
    Linearly map ``value`` from the input range onto the output range.

    Args:
        value: Number to convert.
        in_min, in_max: Bounds of the input range; they must differ.
        out_min, out_max: Bounds of the output range. ``out_min`` may be
            greater than ``out_max`` to obtain an inverted mapping.
        clamp: When True, the result is limited to the output range. When
            False (the default), inputs outside ``[in_min, in_max]`` are
            extrapolated beyond the output range.

    Returns:
        The mapped value.

    Raises:
        ZeroDivisionError: If ``in_min == in_max`` (zero-width input range)
            for plain Python numbers.
    """
    mapped = (value - in_min) * (out_max - out_min) / (in_max - in_min) + out_min
    if not clamp:
        return mapped

    # Order the bounds so clamping also works for inverted output ranges.
    lower, upper = (out_min, out_max) if out_min <= out_max else (out_max, out_min)
    return max(lower, min(upper, mapped))

In [16]:
# Open the default webcam (device index 0).
cap = cv2.VideoCapture(0)

# Fail fast when the camera is unavailable; otherwise the capture loop would
# exit immediately without any indication of what went wrong.
if not cap.isOpened():
    cap.release()
    raise RuntimeError("Could not open webcam (device index 0).")

# Build the hand landmarker from the previously configured options.
landmarker = HandLandmarker.create_from_options(options)

# Reference point for the per-frame timestamps passed to detect_for_video.
start_time = time.time()

In [None]:
# Thumb-tip to index-tip distance thresholds, in pixels of the captured frame.
PINCH_MUTE_PX = 25    # below this the fingers count as pinched -> minimum volume
PINCH_MAX_PX = 200    # at or beyond this the volume is at its maximum

# detect_for_video expects monotonically increasing timestamps. Wall-clock time
# can repeat within a millisecond or step backwards, so the last value sent is
# tracked and each new timestamp is forced to be strictly greater.
last_timestamp_ms = -1

try:
    while cap.isOpened():
        success, frame = cap.read()
        if not success:
            break

        # OpenCV delivers BGR frames; MediaPipe expects RGB.
        rgb_frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
        mp_image = mp.Image(image_format=mp.ImageFormat.SRGB, data=rgb_frame)

        # Milliseconds since start_time, kept strictly increasing.
        timestamp_ms = int((time.time() - start_time) * 1000)
        timestamp_ms = max(timestamp_ms, last_timestamp_ms + 1)
        last_timestamp_ms = timestamp_ms

        # Synchronous per-frame detection (VIDEO running mode).
        detection_result = landmarker.detect_for_video(mp_image, timestamp_ms)

        if detection_result.hand_landmarks:
            hand = detection_result.hand_landmarks[0]

            # Landmark 4 is the thumb tip, landmark 8 the index-finger tip.
            # Coordinates are normalized, so scale them to pixel positions.
            h, w, _ = frame.shape
            x1, y1 = int(hand[4].x * w), int(hand[4].y * h)
            x2, y2 = int(hand[8].x * w), int(hand[8].y * h)

            # Draw both fingertips and the line connecting them.
            cv2.circle(frame, (x1, y1), 10, (255, 0, 0), cv2.FILLED)
            cv2.circle(frame, (x2, y2), 10, (255, 0, 0), cv2.FILLED)
            cv2.line(frame, (x1, y1), (x2, y2), (255, 0, 0), 3)

            # Pixel distance between the two fingertips.
            length = np.hypot(x2 - x1, y2 - y1)

            if length < PINCH_MUTE_PX:
                # Pinched: drop to the lowest supported level and label the frame.
                volume.SetMasterVolumeLevel(min_vol, None)
                cv2.putText(frame, "MUTED", (50, 50), cv2.FONT_HERSHEY_PLAIN, 2, (0, 0, 255), 2)
            else:
                # np.interp clamps to the output range, so distances beyond
                # PINCH_MAX_PX simply yield max_vol.
                target_vol = np.interp(length, [PINCH_MUTE_PX, PINCH_MAX_PX], [min_vol, max_vol])
                volume.SetMasterVolumeLevel(float(target_vol), None)

        # Show the annotated frame; press 'q' in the window to stop.
        cv2.imshow('Volume Control', frame)
        if cv2.waitKey(1) & 0xFF == ord('q'):
            break
finally:
    # Always release the camera, window and landmarker, including when the
    # loop is aborted by an exception or a kernel interrupt.
    cap.release()
    cv2.destroyAllWindows()
    landmarker.close()