# 控制音量-摄像头

同济子豪兄 2023-4-14

预备知识：https://www.bilibili.com/video/BV1x44y127Yu

## 导入工具包

In [12]:
import cv2
import numpy as np
from PIL import Image
import mediapipe as mp

# 导入python绘图matplotlib
import matplotlib.pyplot as plt
# 使用ipython的魔法方法，将绘制出的图像直接嵌入在notebook单元格中
%matplotlib inline

In [13]:
# 修改电脑音量函数-Mac电脑
import osascript
def modify_volume(target_volume):
    """Set the macOS output volume via AppleScript.

    Builds the AppleScript command ``set volume output volume <target_volume>``
    and executes it with ``osascript.osascript``. The value returned by that
    call is discarded.

    Args:
        target_volume: Value appended to the command via its string form.

    Returns:
        None.
    """
    command = 'set volume output volume {}'.format(target_volume)
    osascript.osascript(command)

In [14]:
# Correspondence between fingertip pixel distance and volume.
# process_frame interpolates linearly between these two ranges with np.interp:
# a distance of 30 px maps to volume 0 and 300 px maps to volume 100; distances
# outside [30, 300] are clamped to the nearest end by np.interp.
distance_base = [30, 300]
volume_base = [0, 100]

## 导入模型

In [15]:
# Shortcut to the MediaPipe Hands solution module
mp_hands = mp.solutions.hands

# Build the hand-landmark model.
# The frames come from a live webcam, i.e. a continuous video stream, so
# static_image_mode must be False: MediaPipe then tracks hands between frames
# instead of re-running detection on every frame, and min_tracking_confidence
# (ignored in static-image mode) actually takes effect.
hands = mp_hands.Hands(static_image_mode=False,       # video stream, not independent still images
                       max_num_hands=4,                # maximum number of hands to detect
                       min_detection_confidence=0.4,   # detection confidence threshold
                       min_tracking_confidence=0.5)    # tracking confidence threshold

# Drawing helpers (used to render the hand skeleton)
mpDraw = mp.solutions.drawing_utils

## 可视化配置

In [16]:
# Per-keypoint drawing style, keyed by landmark index (0-20):
#   'color'     - color tuple passed to cv2.circle on the BGR frame
#   'radius'    - base circle radius; process_frame scales it by relative depth
#   'thickness' - cv2.circle thickness; -1 draws a filled circle, a positive
#                 value draws an outline of that width
color_map = {
    0:{'color':(0,0,255), 'radius':10, 'thickness':-1},      # wrist
    1:{'color':(16,144,247), 'radius':10, 'thickness':-1},
    2:{'color':(1,240,255), 'radius':10, 'thickness':-1},
    3:{'color':(140,47,240), 'radius':10, 'thickness':-1},
    4:{'color':(223,155,60), 'radius':15, 'thickness':2},    # thumb tip
    5:{'color':(16,144,247), 'radius':10, 'thickness':-1},
    6:{'color':(1,240,255), 'radius':10, 'thickness':-1},
    7:{'color':(140,47,240), 'radius':10, 'thickness':-1},
    8:{'color':(0,0,255), 'radius':15, 'thickness':2},       # index fingertip
    9:{'color':(16,144,247), 'radius':10, 'thickness':-1},
    10:{'color':(1,240,255), 'radius':10, 'thickness':-1},
    11:{'color':(140,47,240), 'radius':10, 'thickness':-1},
    12:{'color':(223,155,60), 'radius':10, 'thickness':-1},
    13:{'color':(16,144,247), 'radius':10, 'thickness':-1},
    14:{'color':(1,240,255), 'radius':10, 'thickness':-1},
    15:{'color':(140,47,240), 'radius':10, 'thickness':-1},
    16:{'color':(223,155,60), 'radius':10, 'thickness':-1},
    17:{'color':(16,144,247), 'radius':10, 'thickness':-1},
    18:{'color':(1,240,255), 'radius':10, 'thickness':-1},
    19:{'color':(140,47,240), 'radius':10, 'thickness':-1},
    20:{'color':(223,155,60), 'radius':10, 'thickness':-1}
}

In [17]:
scaler = 1 # Text scale factor: multiplies the position, font size and stroke width of on-screen text

## 逐帧处理函数

运行慢的原因：修改电脑音量的 `modify_volume(target_volume)` 调用比较耗时（它通过 osascript 执行 AppleScript 命令）。

In [18]:
# Per-frame handler: detects hands, draws their keypoints, and maps the
# thumb-to-index fingertip distance of the first detected hand to the system volume.
def _put_label(img_bgr, text, baseline_y):
    """Write magenta status text at x=25, y=baseline_y, both scaled by `scaler`."""
    # cv2.putText needs integer coordinates and an integer, positive stroke
    # width, so a fractional `scaler` is rounded down here.
    origin = (int(25 * scaler), int(baseline_y * scaler))
    stroke = max(int(2 * scaler), 1)
    return cv2.putText(img_bgr, text, origin, cv2.FONT_HERSHEY_SIMPLEX,
                       1.25 * scaler, (255, 0, 255), stroke)


def _draw_hand_keypoints(img_bgr, hand_21, w, h):
    """Draw the skeleton and the depth-scaled keypoint circles of one hand."""
    # Skeleton connections plus the default landmark markers
    mpDraw.draw_landmarks(img_bgr, hand_21, mp_hands.HAND_CONNECTIONS)

    # Depth of the wrist (landmark 0) is the reference for all other keypoints
    wrist_z = hand_21.landmark[0].z

    for i, style in color_map.items():
        point = hand_21.landmark[i]

        # Normalized landmark coordinates -> pixel coordinates
        cx = int(point.x * w)
        cy = int(point.y * h)

        # Depth relative to the wrist; the circle grows as this value increases
        depth_z = wrist_z - point.z
        radius = max(int(style['radius'] * (1 + depth_z * 5)), 0)

        img_bgr = cv2.circle(img_bgr, (cx, cy), radius, style['color'], style['thickness'])

    return img_bgr


def _apply_volume(target_volume):
    """Set the system volume, skipping the call when the value has not changed.

    Every modify_volume call executes an AppleScript command, which the
    notebook identifies as the cause of slow frame processing, so repeated
    requests for the volume that was set last are dropped.
    """
    if getattr(_apply_volume, 'last_volume', None) != target_volume:
        modify_volume(target_volume)
        _apply_volume.last_volume = target_volume


def _control_volume(img_bgr, hand_21, w, h):
    """Map the thumb-index fingertip distance of one hand to the volume and draw the overlay."""
    # Landmark 4 is the thumb tip and landmark 8 is the index fingertip
    thumb_tip = (int(hand_21.landmark[4].x * w), int(hand_21.landmark[4].y * h))
    index_tip = (int(hand_21.landmark[8].x * w), int(hand_21.landmark[8].y * h))

    # Line connecting the two fingertips
    img_bgr = cv2.line(img_bgr, thumb_tip, index_tip, color=(1, 240, 255), thickness=4)

    # Pixel distance between the fingertips
    distance = np.linalg.norm([thumb_tip[0] - index_tip[0], thumb_tip[1] - index_tip[1]])
    distance_str = 'Distance {:.2f}'.format(distance)
    img_bgr = _put_label(img_bgr, distance_str, 120)

    # Linear interpolation from the distance range to the volume range
    # (np.interp clamps distances outside distance_base to the end values)
    target_volume = int(np.interp(distance, distance_base, volume_base))
    _apply_volume(target_volume)

    # Volume bar: green outline with a filled part proportional to the volume
    bar_left, bar_top = 25, 180
    bar_right, bar_bottom = 400, 215
    cv2.rectangle(img_bgr, (bar_left, bar_top), (bar_right, bar_bottom), (0, 255, 0), 3)
    fill_right = int(bar_left + (bar_right - bar_left) * (target_volume / 100))
    cv2.rectangle(img_bgr, (bar_left, bar_top), (fill_right, bar_bottom), (0, 255, 0), cv2.FILLED)

    # Volume value to the right of the bar
    cv2.putText(img_bgr, f'{target_volume} %', (bar_right + 20, bar_bottom),
                cv2.FONT_HERSHEY_COMPLEX, 2, (0, 255, 0), 3)

    return img_bgr


def process_frame(img_bgr):
    """Process one camera frame and return the annotated frame.

    The frame is mirrored horizontally, passed to the MediaPipe `hands` model,
    and every detected hand is drawn. The first detected hand controls the
    volume: the pixel distance between its thumb tip and index fingertip is
    interpolated from `distance_base` to `volume_base` and applied through
    `modify_volume`. The distance, a volume bar and the FPS are drawn on the frame.

    Reads the module-level settings `distance_base`, `volume_base`, `scaler`
    and `color_map`; none of them is modified.

    Args:
        img_bgr: BGR image as delivered by cv2.VideoCapture.read().

    Returns:
        The mirrored, annotated BGR image.
    """
    # Imported locally so the function does not depend on a later cell
    # having imported `time` first.
    import time

    # Start of processing for this frame
    start_time = time.perf_counter()

    # Image height and width
    h, w = img_bgr.shape[:2]

    # Mirror horizontally so left/right in the picture match the user's hands
    # (flip code 1: horizontal, 0: vertical, -1: both)
    img_bgr = cv2.flip(img_bgr, 1)

    # The model expects RGB input
    img_rgb = cv2.cvtColor(img_bgr, cv2.COLOR_BGR2RGB)
    results = hands.process(img_rgb)

    detected_hands = results.multi_hand_landmarks
    if detected_hands:
        # Draw every detected hand
        for hand_21 in detected_hands:
            img_bgr = _draw_hand_keypoints(img_bgr, hand_21, w, h)

        # Only the first detected hand controls the volume
        img_bgr = _control_volume(img_bgr, detected_hands[0], w, h)

    # Frames per second for this frame; guard against a zero-length interval
    elapsed = time.perf_counter() - start_time
    FPS = 1 / elapsed if elapsed > 0 else 0

    FPS_string = 'FPS  ' + str(int(FPS))
    img_bgr = _put_label(img_bgr, FPS_string, 60)

    return img_bgr

## 调用摄像头实时画面

In [19]:
# 调用摄像头逐帧实时处理模板
# 不需修改任何代码，只需修改process_frame函数即可
# 同济子豪兄 2021-7-8

# 导入opencv-python
import cv2
import time

# Open the system default camera (index 0)
cap = cv2.VideoCapture(0)

# The constructor already tries to open the device; retry only if that failed
if not cap.isOpened():
    cap.open(0)

try:
    # Loop until the camera closes, a frame cannot be read, or the user quits
    while cap.isOpened():

        # Grab one frame
        success, frame = cap.read()

        if not success: # stop when no frame could be read
            print('获取画面不成功，退出')
            break

        # Per-frame processing
        frame = process_frame(frame)

        # Show the processed three-channel image
        cv2.imshow('my_window', frame)

        # Wait up to 60 ms for a key press
        key_pressed = cv2.waitKey(60)

        # Quit on 'q' or Esc (keyboard must be in an English input mode)
        if key_pressed in [ord('q'), 27]:
            break
finally:
    # Always free the camera and close the window, even if processing a frame
    # raised an exception or the run was interrupted
    cap.release()
    cv2.destroyAllWindows()