# This code has not been automated or optimized yet.
# For now it is meant to be run on one video at a time so the result can be inspected.

In [48]:
from PIL import Image
from collections import deque
import os
import numpy as np
import pandas as pd
import json
import cv2
import mediapipe as mp
import time

In [2]:
# Collect every face video and every drowsiness annotation found below the
# current working directory.
target_files, json_paths = [], []
for root, _dirs, files in os.walk("."):
    for fname in files:
        full_path = os.path.join(root, fname)
        if fname.endswith("rgb_face.mp4"):
            target_files.append(full_path)
        if fname.endswith("drowsiness.json"):
            json_paths.append(full_path)

print(target_files)
print(json_paths)

['.\\dmd\\gA\\1\\s5\\gA_1_s5_2019-03-14T14;26;17+01;00_rgb_face.mp4', '.\\dmd\\gA\\2\\s5\\gA_2_s5_2019-03-13T09;19;23+01;00_rgb_face.mp4', '.\\dmd\\gA\\3\\s5\\gA_3_s5_2019-03-13T09;36;25+01;00_rgb_face.mp4', '.\\dmd\\gA\\4\\s5\\gA_4_s5_2019-03-13T10;56;52+01;00_rgb_face.mp4', '.\\dmd\\gA\\5\\s5\\gA_5_s5_2019-03-13T09;06;49+01;00_rgb_face.mp4', '.\\dmd\\gB\\10\\s5\\gB_10_s5_2019-03-12T10;35;20+01;00_rgb_face.mp4', '.\\dmd\\gB\\6\\s5\\gB_6_s5_2019-03-13T13;37;11+01;00_rgb_face.mp4', '.\\dmd\\gB\\7\\s5\\gB_7_s5_2019-03-13T13;55;52+01;00_rgb_face.mp4', '.\\dmd\\gB\\8\\s5\\gB_8_s5_2019-03-13T14;10;09+01;00_rgb_face.mp4', '.\\dmd\\gB\\9\\s5\\gB_9_s5_2019-03-07T16;31;48+01;00_rgb_face.mp4']
['.\\dmd\\gA\\1\\s5\\gA_1_s5_2019-03-14T14;26;17+01;00_rgb_ann_drowsiness.json', '.\\dmd\\gA\\2\\s5\\gA_2_s5_2019-03-13T09;19;23+01;00_rgb_ann_drowsiness.json', '.\\dmd\\gA\\3\\s5\\gA_3_s5_2019-03-13T09;36;25+01;00_rgb_ann_drowsiness.json', '.\\dmd\\gA\\4\\s5\\gA_4_s5_2019-03-13T10;56;52+01;00_rgb_ann_drow

In [None]:
OUTPUT_VIDEO = "labeled_output.mp4"              # output video file name
OUTPUT_CSV = "per_frame_labels.csv"              # output CSV file name

CALIB_SECONDS = 2.0       # length of the initial calibration window (seconds)
FPS_FALLBACK = 30.0       # default used when the video reports no FPS
SMOOTH_WIN = 5            # moving-average window for EAR/MAR
BLINK_MAX_FRAMES = 8      # longest closed-eye run (in frames) still treated as a blink
YAWN_MIN_FRAMES = 45      # minimum run (in frames) before it is treated as a yawn
HAND_MOUTH_DIST_PX = 80   # max distance (pixels) between a hand landmark and the mouth centre to count as "near"

# Threshold ratios for the state machine (multiplied by the calibration result)
EYE_CLOSE_RATIO = 0.85    # eye-closed threshold: EAR_low = median(EAR_calib)*EYE_CLOSE_RATIO
EYE_OPEN_RATIO  = 1.05    # eye-open threshold: EAR_high = median(EAR_calib)*EYE_OPEN_RATIO
MOUTH_YAWN_RATIO = 1.25   # yawn threshold: MAR_high = median(MAR_calib)*MOUTH_YAWN_RATIO

In [None]:
# Aliases for the MediaPipe FaceMesh / Hands solutions and the drawing helpers
mp_face = mp.solutions.face_mesh
mp_hands = mp.solutions.hands
mp_draw = mp.solutions.drawing_utils
mp_styles = mp.solutions.drawing_styles

# FaceMesh landmark indices used for the eye/mouth ratios (MediaPipe FaceMesh)
# EAR: (vertical lid distances) / (horizontal corner distance)  -- conventional definition
# NOTE(review): assuming the usual 6-point EAR ordering, 160 sits above 144 and
# 158 above 153 (i.e. top1 pairs with bottom2) -- confirm against the FaceMesh landmark map.
LEFT_EYE = [33, 160, 158, 133, 153, 144]   # [left, top1, top2, right, bottom1, bottom2]
RIGHT_EYE = [362, 385, 387, 263, 373, 380]
# MAR: (vertical distance) / (horizontal distance)
MOUTH_HORZ = (61, 291)    # left/right outer mouth corners
MOUTH_VERT = (13, 14)     # top/bottom (centre of the inner lips)

# Rolling buffers holding the most recent SMOOTH_WIN raw EAR / MAR values
ear_buf = deque(maxlen=SMOOTH_WIN)
mar_buf = deque(maxlen=SMOOTH_WIN)

# Eye state machine: state ids 0..4
EYE_OPEN, EYE_CLOSE, EYE_OPENING, EYE_CLOSING, EYE_BLINK = range(5)

eye_state = EYE_OPEN    # assume the eyes start open
close_count = 0         # consecutive closed frames (used for blink detection)

# Yawn state ids 0..2
YAWN_NONE, YAWN_WITH_HAND, YAWN_WITHOUT_HAND = range(3)

yawn_state = YAWN_NONE
yawn_count = 0

# Samples gathered during calibration
ear_samples, mar_samples = [], []

In [None]:
# Helper functions for computing the eye/mouth measures and labels, plus ROI drawing
def euclid(p1, p2):
    """Return the Euclidean distance between two points given as coordinate sequences."""
    return np.linalg.norm(np.array(p1) - np.array(p2))

def eye_aspect_ratio(landmarks, eye_idx, w, h):
    """Compute the eye aspect ratio (EAR) of one eye in pixel space.

    Args:
        landmarks: indexable collection of landmarks exposing normalised
            ``.x`` / ``.y`` attributes.
        eye_idx: six landmark indices ordered ``[p1, p2, p3, p4, p5, p6]``:
            ``p1``/``p4`` are the eye corners, ``p2``/``p3`` lie on the upper
            lid and ``p5``/``p6`` on the lower lid, going around the eye, so
            that ``p2`` faces ``p6`` and ``p3`` faces ``p5``.
        w: frame width in pixels, used to scale ``.x``.
        h: frame height in pixels, used to scale ``.y``.

    Returns:
        Mean of the two lid-to-lid distances divided by the corner-to-corner
        distance.
    """
    p1, p2, p3, p4, p5, p6 = [
        (landmarks[i].x * w, landmarks[i].y * h) for i in eye_idx[:6]
    ]
    # Pair each upper-lid point with the lower-lid point directly opposite it
    # (p2-p6, p3-p5). Pairing p2-p5 / p3-p6 measures diagonals across the eye
    # and mixes horizontal extent into the "vertical" term.
    vertical = (euclid(p2, p6) + euclid(p3, p5)) / 2.0
    # Small epsilon guards against division by zero on degenerate landmarks.
    horizontal = euclid(p1, p4) + 1e-6
    return vertical / horizontal

def mouth_aspect_ratio(landmarks, w, h):
    """Compute the mouth aspect ratio (MAR) and the mouth centre in pixels.

    Args:
        landmarks: indexable collection of landmarks exposing normalised
            ``.x`` / ``.y`` attributes.
        w: frame width in pixels, used to scale ``.x``.
        h: frame height in pixels, used to scale ``.y``.

    Returns:
        ``(mar, (cx, cy))`` where ``mar`` is the MOUTH_VERT distance divided by
        the MOUTH_HORZ distance, ``cx`` is the mean x of the MOUTH_HORZ points
        and ``cy`` is the mean y of the MOUTH_VERT points.
    """
    def to_px(idx):
        lm = landmarks[idx]
        return (lm.x * w, lm.y * h)

    left, right = to_px(MOUTH_HORZ[0]), to_px(MOUTH_HORZ[1])
    upper, lower = to_px(MOUTH_VERT[0]), to_px(MOUTH_VERT[1])

    # Small epsilon keeps the ratio finite when both corners coincide.
    mouth_width = euclid(left, right) + 1e-6
    mouth_height = euclid(upper, lower)
    centre = ((left[0] + right[0]) / 2, (upper[1] + lower[1]) / 2)
    return mouth_height / mouth_width, centre

def moving_avg(queue, k):
    """Return the mean of the last ``k`` values of ``queue`` as a float.

    Values are averaged in float32. When fewer than ``k`` values are present
    all of them are averaged; an empty ``queue`` yields ``None``.
    """
    values = np.array(queue, dtype=np.float32)
    if len(values) == 0:
        return None
    window = values if len(values) < k else values[-k:]
    return float(np.mean(window))

def draw_roi(frame, pts, color=(0,255,0), thickness=2):
    """Draw the axis-aligned bounding box of ``pts`` on ``frame``.

    Returns the box as ``(x1, y1, x2, y2)`` with integer coordinates.
    """
    x1 = int(min(p[0] for p in pts))
    y1 = int(min(p[1] for p in pts))
    x2 = int(max(p[0] for p in pts))
    y2 = int(max(p[1] for p in pts))
    box = (x1, y1, x2, y2)
    cv2.rectangle(frame, box[:2], box[2:], color, thickness)
    return box

def is_hand_near_mouth(hand_landmarks_list, mouth_center, max_dist_px, w, h):
    """Tell whether any hand landmark lies within ``max_dist_px`` of the mouth.

    Args:
        hand_landmarks_list: iterable of hands, each exposing ``.landmark``
            (landmarks with normalised ``.x`` / ``.y``), or ``None``.
        mouth_center: ``(x, y)`` of the mouth centre in pixels.
        max_dist_px: inclusive distance limit in pixels.
        w: frame width in pixels, used to scale ``.x``.
        h: frame height in pixels, used to scale ``.y``.

    Returns:
        ``True`` as soon as one landmark is close enough, otherwise ``False``
        (also when ``hand_landmarks_list`` is ``None``).
    """
    if hand_landmarks_list is None:
        return False
    mouth_x, mouth_y = mouth_center
    return any(
        euclid((lm.x * w, lm.y * h), (mouth_x, mouth_y)) <= max_dist_px
        for hand in hand_landmarks_list
        for lm in hand.landmark
    )

In [56]:
# Open the first discovered video; fail loudly instead of silently producing
# an empty output when nothing was found or the file cannot be decoded.
if not target_files:
    raise FileNotFoundError("No '*rgb_face.mp4' video found under the current directory")
video_path = target_files[0]
cap = cv2.VideoCapture(video_path)
if not cap.isOpened():
    raise IOError(f"Could not open video: {video_path}")

fps = cap.get(cv2.CAP_PROP_FPS)
if fps <= 1e-2:
    fps = FPS_FALLBACK  # container reported no usable FPS
width  = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
raw_frame_count = cap.get(cv2.CAP_PROP_FRAME_COUNT)
total_frames = int(raw_frame_count) if raw_frame_count > 0 else None

# The writer does not create missing directories, so make sure it exists.
os.makedirs("./output", exist_ok=True)
fourcc = cv2.VideoWriter_fourcc(*"mp4v")
out = cv2.VideoWriter(f"./output/{OUTPUT_VIDEO}", fourcc, fps, (width, height))
if not out.isOpened():
    cap.release()
    raise IOError(f"Could not open video writer for: ./output/{OUTPUT_VIDEO}")

# Per-frame log rows, later exported as CSV
logs = []

# Number of frames in the calibration window
calib_frames = int(CALIB_SECONDS * fps)

In [57]:
# MediaPipe processing: calibrate the thresholds on the first frames, then
# label, annotate and log every frame of the video.
def _landmarks_to_px(landmarks, idxs, w, h):
    """Return integer pixel coordinates for the landmarks selected by ``idxs``."""
    return [(int(landmarks[i].x * w), int(landmarks[i].y * h)) for i in idxs]

# (label_id, label_name) emitted for each eye state
_EYE_LABELS = {
    EYE_OPEN: (0, "eyes_state/open"),
    EYE_CLOSE: (1, "eyes_state/close"),
    EYE_OPENING: (2, "eyes_state/opening"),
    EYE_CLOSING: (3, "eyes_state/closing"),
    EYE_BLINK: (4, "blinks/blinking"),
}

# Start from a clean state so that processing another video in the same
# session does not inherit calibration samples, buffers or counters.
ear_samples.clear()
mar_samples.clear()
ear_buf.clear()
mar_buf.clear()
eye_state = EYE_OPEN
close_count = 0
yawn_state = YAWN_NONE
yawn_count = 0

start = time.perf_counter()
with mp_face.FaceMesh(
    static_image_mode=False,
    refine_landmarks=True,
    max_num_faces=1,
    min_detection_confidence=0.5,
    min_tracking_confidence=0.5
) as face_mesh, mp_hands.Hands(
    static_image_mode=False,
    max_num_hands=2,
    min_detection_confidence=0.5,
    min_tracking_confidence=0.5
) as hands:

    frame_idx = 0

    # 1) Calibration: collect EAR/MAR samples over the first calib_frames frames
    while True:
        ret, frame = cap.read()
        if not ret:
            break
        frame_idx += 1
        rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
        face_res = face_mesh.process(rgb)

        if face_res.multi_face_landmarks:
            face_landmarks = face_res.multi_face_landmarks[0].landmark
            ear_l = eye_aspect_ratio(face_landmarks, LEFT_EYE, width, height)
            ear_r = eye_aspect_ratio(face_landmarks, RIGHT_EYE, width, height)
            ear = (ear_l + ear_r) / 2.0
            mar, _ = mouth_aspect_ratio(face_landmarks, width, height)
            ear_samples.append(ear)
            mar_samples.append(mar)

        if frame_idx >= calib_frames:
            break

    # Derive the thresholds from the calibration medians
    if len(ear_samples) >= 5:
        ear_med = float(np.median(ear_samples))
        EAR_LOW  = ear_med * EYE_CLOSE_RATIO
        EAR_HIGH = ear_med * EYE_OPEN_RATIO
    else:
        EAR_LOW, EAR_HIGH = 0.18, 0.26  # fallback defaults (may vary with face size/distance)

    if len(mar_samples) >= 5:
        mar_med = float(np.median(mar_samples))
        MAR_HIGH = mar_med * MOUTH_YAWN_RATIO
    else:
        MAR_HIGH = 0.6  # fallback default

    # 2) Main processing loop (restart from the first frame)
    cap.set(cv2.CAP_PROP_POS_FRAMES, 0)
    frame_idx = 0

    while True:
        ret, frame = cap.read()
        if not ret:
            break
        frame_idx += 1

        rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
        face_res = face_mesh.process(rgb)
        hands_res = hands.process(rgb)

        label_id = 0
        label_str = "eyes_state/open"
        # Raw measures of THIS frame; they stay NaN when no face is detected so
        # the log never repeats a value measured on an earlier frame.
        ear_raw = np.nan
        mar_raw = np.nan

        if face_res.multi_face_landmarks:
            face_landmarks = face_res.multi_face_landmarks[0].landmark

            # EAR/MAR
            ear_l = eye_aspect_ratio(face_landmarks, LEFT_EYE, width, height)
            ear_r = eye_aspect_ratio(face_landmarks, RIGHT_EYE, width, height)
            ear = (ear_l + ear_r) / 2.0
            mar, mouth_center = mouth_aspect_ratio(face_landmarks, width, height)
            ear_raw, mar_raw = float(ear), float(mar)

            # The buffers were just appended to, so the averages are never None.
            ear_buf.append(ear)
            mar_buf.append(mar)
            ear_s = moving_avg(ear_buf, SMOOTH_WIN)
            mar_s = moving_avg(mar_buf, SMOOTH_WIN)

            # Eye state machine update
            prev_state = eye_state
            # opening/closing is decided from the EAR slope over the last 3 frames
            if len(ear_buf) >= 3:
                ear_deriv = ear_buf[-1] - ear_buf[-3]
            else:
                ear_deriv = 0.0

            if ear_s < EAR_LOW:
                if prev_state != EYE_CLOSE:
                    eye_state = EYE_CLOSING if ear_deriv < 0 else EYE_CLOSE
                else:
                    eye_state = EYE_CLOSE
                close_count += 1
            elif ear_s > EAR_HIGH:
                if prev_state != EYE_OPEN:
                    eye_state = EYE_OPENING if ear_deriv > 0 else EYE_OPEN
                else:
                    eye_state = EYE_OPEN
                # Blink: the eyes were closed only briefly before reopening
                if 0 < close_count <= BLINK_MAX_FRAMES:
                    eye_state = EYE_BLINK
                close_count = 0
            else:
                # Between the two thresholds: use the slope for opening/closing
                # (a zero slope keeps the previous state)
                if ear_deriv > 0:
                    eye_state = EYE_OPENING
                elif ear_deriv < 0:
                    eye_state = EYE_CLOSING
                # Keep counting while the eyes were closed/closing
                if prev_state in (EYE_CLOSE, EYE_CLOSING):
                    close_count += 1
                else:
                    close_count = 0

            # Yawn state
            hand_near = is_hand_near_mouth(
                hands_res.multi_hand_landmarks if hands_res else None,
                mouth_center, HAND_MOUTH_DIST_PX, width, height
            )
            if mar_s > MAR_HIGH:
                yawn_count += 1
                yawn_state = YAWN_WITH_HAND if hand_near else YAWN_WITHOUT_HAND
            else:
                yawn_state = YAWN_NONE
                yawn_count = 0

            # Final label priority: yawn > eye state / blink
            if yawn_state != YAWN_NONE and yawn_count >= YAWN_MIN_FRAMES:
                if yawn_state == YAWN_WITH_HAND:
                    label_id, label_str = 5, "yawning/Yawning with hand"
                else:
                    label_id, label_str = 6, "yawning/Yawning without hand"
            else:
                label_id, label_str = _EYE_LABELS.get(eye_state, (label_id, label_str))

            # Draw the face mesh
            mp_draw.draw_landmarks(
                frame,
                face_res.multi_face_landmarks[0],
                mp_face.FACEMESH_TESSELATION,
                landmark_drawing_spec=None,
                connection_drawing_spec=mp_styles.get_default_face_mesh_tesselation_style()
            )
            # Eye / mouth ROI boxes
            left_eye_pts = _landmarks_to_px(face_landmarks, LEFT_EYE, width, height)
            right_eye_pts = _landmarks_to_px(face_landmarks, RIGHT_EYE, width, height)
            mouth_pts = _landmarks_to_px(
                face_landmarks,
                [MOUTH_HORZ[0], MOUTH_HORZ[1], MOUTH_VERT[0], MOUTH_VERT[1]],
                width, height
            )
            draw_roi(frame, left_eye_pts, (0,255,0), 2)
            draw_roi(frame, right_eye_pts, (0,255,0), 2)
            draw_roi(frame, mouth_pts, (255,0,0), 2)

            # Hand landmarks
            if hands_res and hands_res.multi_hand_landmarks:
                for hlm in hands_res.multi_hand_landmarks:
                    mp_draw.draw_landmarks(frame, hlm, mp_hands.HAND_CONNECTIONS)

            # Label / metric overlay text
            cv2.putText(frame, f"Label {label_id}: {label_str}", (20, 40),
                        cv2.FONT_HERSHEY_SIMPLEX, 0.9, (0, 200, 255), 2)
            cv2.putText(frame, f"EAR:{ear_s:.3f}  (low:{EAR_LOW:.3f} high:{EAR_HIGH:.3f})",
                        (20, 70), cv2.FONT_HERSHEY_SIMPLEX, 0.7, (0, 255, 0), 2)
            cv2.putText(frame, f"MAR:{mar_s:.3f}  (yawn>{MAR_HIGH:.3f})",
                        (20, 100), cv2.FONT_HERSHEY_SIMPLEX, 0.7, (255, 200, 0), 2)
            if yawn_state != YAWN_NONE:
                cv2.putText(frame, f"HandNearMouth: {bool(hand_near)}  YawnFrames:{yawn_count}",
                            (20, 130), cv2.FONT_HERSHEY_SIMPLEX, 0.7, (0, 180, 255), 2)
        else:
            # No face detected: keep the default label (eyes assumed open)
            label_id, label_str = 0, "eyes_state/open"
            cv2.putText(frame, "Face not detected", (20, 40),
                        cv2.FONT_HERSHEY_SIMPLEX, 0.9, (0, 0, 255), 2)

        # Per-frame log row
        logs.append({
            "frame": frame_idx,
            "time_sec": frame_idx / fps,
            "label_id": label_id,
            "label_name": label_str,
            "EAR": ear_raw,
            "MAR": mar_raw
        })

        out.write(frame)

print(f"{time.perf_counter()-start:.4f} sec")

189.7125 sec


In [58]:
# Release the capture/writer handles and close any OpenCV windows
cap.release()
out.release()
cv2.destroyAllWindows()

# Export the per-frame log; build the path from OUTPUT_CSV so the message
# below reports the file that was actually written.
os.makedirs("./output", exist_ok=True)
csv_path = f"./output/{OUTPUT_CSV}"
pd.DataFrame(logs).to_csv(csv_path, index=False)
print(f"Saved video to: ./output/{OUTPUT_VIDEO}")
print(f"Saved per-frame CSV to: {csv_path}")

Saved video to: labeled_output.mp4
Saved per-frame CSV to: per_frame_labels.csv
