In [4]:
import cv2
from ultralytics import YOLO
import mediapipe as mp

person_model = YOLO("yolov8n.pt")        # детектор людей
face_model = mp.solutions.face_detection.FaceDetection(model_selection=1, min_detection_confidence=0.5)

tracker = person_model.track

cap = cv2.VideoCapture("vid.mp4")

while True:
    ret, frame = cap.read()
    if not ret:
        break

    track_results = person_model.track(frame, classes=[0], persist=True)[0]   # class=0 = person

    annotated = track_results.plot()

    rgb_frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
    face_results = face_model.process(rgb_frame)

    if face_results.detections:
        for det in face_results.detections:
            bbox = det.location_data.relative_bounding_box
            h, w, _ = frame.shape
            x1 = int(bbox.xmin * w)
            y1 = int(bbox.ymin * h)
            x2 = int((bbox.xmin + bbox.width) * w)
            y2 = int((bbox.ymin + bbox.height) * h)
            cv2.rectangle(annotated, (x1, y1), (x2, y2), (0, 255, 0), 2)

    cv2.imshow("People + Faces + Tracking", annotated)

    if cv2.waitKey(1) & 0xFF == 27:
        break

cap.release()
cv2.destroyAllWindows()


AttributeError: module 'mediapipe' has no attribute 'solutions'

In [5]:
import cv2
from ultralytics import YOLO
import dlib
import numpy as np
from imutils import face_utils

person_model = YOLO("yolov8n.pt")  # или yolov11n.pt

predictor_path = "shape_predictor_68_face_landmarks.dat"
dlib_detector = dlib.get_frontal_face_detector()
predictor = dlib.shape_predictor(predictor_path)


def eye_aspect_ratio(eye):
    A = np.linalg.norm(eye[1] - eye[5])
    B = np.linalg.norm(eye[2] - eye[4])
    C = np.linalg.norm(eye[0] - eye[3])
    return (A + B) / (2.0 * C)


def is_smiling(landmarks):
    left_corner = landmarks[48]
    right_corner = landmarks[54]
    top_lip = landmarks[51]
    bottom_lip = landmarks[57]

    middle_point = ((left_corner + right_corner) / 2).astype(int)

    mouth_width = np.linalg.norm(left_corner - right_corner)
    mouth_height = np.linalg.norm(top_lip - bottom_lip)

    mouth_center_y = (top_lip[1] + bottom_lip[1]) / 2
    left_lift = mouth_center_y - left_corner[1]
    right_lift = mouth_center_y - right_corner[1]

    smile_metric = (left_lift + right_lift)/2 + (mouth_width / mouth_height)
    return smile_metric > 7.0, middle_point


cap = cv2.VideoCapture("vid_3.mp4")

while True:
    ret, frame = cap.read()
    if not ret:
        break

    results = person_model.track(frame, classes=[0], persist=True)[0]  # class=0 = person

    annotated = results.plot()

    for box in results.boxes:
        x1, y1, x2, y2 = map(int, box.xyxy[0])
        track_id = int(box.id) if box.id is not None else -1

        person_roi = frame[y1:y2, x1:x2]
        gray_roi = cv2.cvtColor(person_roi, cv2.COLOR_BGR2GRAY)

        faces = dlib_detector(gray_roi)

        for face in faces:
            fx1 = x1 + face.left()
            fy1 = y1 + face.top()
            fx2 = x1 + face.right()
            fy2 = y1 + face.bottom()

            cv2.rectangle(annotated, (fx1, fy1), (fx2, fy2), (0, 255, 0), 2)

            # landmarks
            shape = predictor(gray_roi, face)
            shape = face_utils.shape_to_np(shape) + np.array([x1, y1])

            # рисуем все 68 точек
            for (i, (px, py)) in enumerate(shape):
                cv2.circle(annotated, (px, py), 2, (0, 255, 0), -1)

            # анализ глаз
            left_eye = shape[42:48]
            right_eye = shape[36:42]
            ear = (eye_aspect_ratio(left_eye) + eye_aspect_ratio(right_eye)) / 2.0
            eyes_open = ear > 0.25

            # анализ улыбки
            smiling, mid = is_smiling(shape)
            cv2.circle(annotated, tuple(mid), 3, (0, 0, 255), -1)

            # вывод информации
            cv2.putText(annotated, f"ID {track_id}", (x1, y1-10),
                        cv2.FONT_HERSHEY_SIMPLEX, 0.6, (255, 255, 0), 2)
            cv2.putText(annotated, f"Eyes: {eyes_open}", (fx1, fy1-25),
                        cv2.FONT_HERSHEY_SIMPLEX, 0.6, (0, 255, 255), 2)
            cv2.putText(annotated, f"Smile: {smiling}", (fx1, fy1-5),
                        cv2.FONT_HERSHEY_SIMPLEX, 0.6, (0, 255, 255), 2)

    # показать
    cv2.imshow("People + Face Analysis + Tracking", annotated)

    if cv2.waitKey(1) & 0xFF == 27:
        break

cap.release()
cv2.destroyAllWindows()



0: 384x640 12 persons, 74.6ms
Speed: 7.1ms preprocess, 74.6ms inference, 15.5ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 12 persons, 83.0ms
Speed: 2.4ms preprocess, 83.0ms inference, 2.1ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 12 persons, 97.9ms
Speed: 2.0ms preprocess, 97.9ms inference, 2.8ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 12 persons, 61.0ms
Speed: 1.6ms preprocess, 61.0ms inference, 2.0ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 13 persons, 56.4ms
Speed: 1.7ms preprocess, 56.4ms inference, 2.5ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 14 persons, 48.1ms
Speed: 2.1ms preprocess, 48.1ms inference, 1.7ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 14 persons, 51.6ms
Speed: 1.9ms preprocess, 51.6ms inference, 1.8ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 13 persons, 44.5ms
Speed: 1.4ms preprocess, 44.5ms inference, 2.4ms postprocess per image a

  smile_metric = (left_lift + right_lift)/2 + (mouth_width / mouth_height)



0: 384x640 15 persons, 45.9ms
Speed: 2.0ms preprocess, 45.9ms inference, 2.2ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 14 persons, 47.8ms
Speed: 1.8ms preprocess, 47.8ms inference, 1.5ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 14 persons, 47.1ms
Speed: 1.8ms preprocess, 47.1ms inference, 2.6ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 14 persons, 52.5ms
Speed: 1.8ms preprocess, 52.5ms inference, 1.5ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 14 persons, 95.1ms
Speed: 2.2ms preprocess, 95.1ms inference, 2.8ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 13 persons, 46.4ms
Speed: 2.2ms preprocess, 46.4ms inference, 1.8ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 13 persons, 53.6ms
Speed: 1.6ms preprocess, 53.6ms inference, 2.1ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 14 persons, 46.1ms
Speed: 1.9ms preprocess, 46.1ms inference, 1.6ms postprocess per image at

KeyboardInterrupt: 

In [5]:
import cv2
import dlib
from imutils import face_utils
from ultralytics import YOLO

person_model = YOLO("yolov8n.pt")
tracker = None

predictor_path = "shape_predictor_68_face_landmarks.dat"
detector = dlib.get_frontal_face_detector()
predictor = dlib.shape_predictor(predictor_path)

def eye_aspect_ratio(eye):
    A = np.linalg.norm(eye[1] - eye[5])
    B = np.linalg.norm(eye[2] - eye[4])
    C = np.linalg.norm(eye[0] - eye[3])
    return (A + B) / (2.0 * C)

def is_smiling(landmarks):
    left_corner = landmarks[48]
    right_corner = landmarks[54]
    top_lip = landmarks[51]
    bottom_lip = landmarks[57]

    mouth_width = np.linalg.norm(left_corner - right_corner)
    mouth_height = np.linalg.norm(top_lip - bottom_lip)
    mouth_center_y = (top_lip[1] + bottom_lip[1]) / 2
    left_lift = mouth_center_y - left_corner[1]
    right_lift = mouth_center_y - right_corner[1]

    smile_metric = (left_lift + right_lift)/2 + (mouth_width / mouth_height)
    is_smile = smile_metric > 7.0
    middle_point = ((left_corner + right_corner) / 2).astype(int)
    return is_smile, middle_point

cap = cv2.VideoCapture("vid_3.mp4")

while True:
    ret, frame = cap.read()
    if not ret:
        break

    results = person_model.track(frame, classes=[0], persist=True)[0]

    annotated_frame = frame.copy()
    for box, score, id in zip(results.boxes.xyxy, results.boxes.conf, results.boxes.id):
        x1, y1, x2, y2 = [int(c) for c in box]
        # Тонкая рамка
        cv2.rectangle(annotated_frame, (x1, y1), (x2, y2), (0, 255, 0), 2)
        cv2.putText(annotated_frame, f"ID:{id}", (x1, y1-5), cv2.FONT_HERSHEY_SIMPLEX, 0.5, (0,255,0), 1)

    gray = cv2.cvtColor(frame, cv2.COLOR_BGR2GRAY)
    faces = detector(gray)

    for face in faces:
        shape = predictor(gray, face)
        shape = face_utils.shape_to_np(shape)

        # Рисуем 68 точек лица
        for (i, (x, y)) in enumerate(shape):
            cv2.circle(annotated_frame, (x, y), 1, (0, 255, 255), -1)

        hull = cv2.convexHull(shape)
        cv2.drawContours(annotated_frame, [hull], -1, (255, 0, 0), 1)

        # Проверка глаз
        left_eye = shape[42:48]
        right_eye = shape[36:42]
        ear = (eye_aspect_ratio(left_eye) + eye_aspect_ratio(right_eye)) / 2.0

        # Проверка улыбки
        smiling, mouth_middle = is_smiling(shape)
        eye_status = "Open" if ear > 0.25 else "Closed"
        smile_status = "Yes" if smiling else "No"

        cv2.circle(annotated_frame, tuple(mouth_middle), 2, (0, 0, 255), -1)
        # Координаты лица
        x1, y1, x2, y2 = face.left(), face.top(), face.right(), face.bottom()
        # Рисуем текст чуть выше головы
        cv2.putText(annotated_frame, f"Eyes:{eye_status}", (x1, y1 - 20), cv2.FONT_HERSHEY_SIMPLEX, 0.5, (0,255,0), 1)
        cv2.putText(annotated_frame, f"Smile:{smile_status}", (x1, y1 - 5), cv2.FONT_HERSHEY_SIMPLEX, 0.5, (0,255,0), 1)

    cv2.imshow("People + Faces + Tracking", annotated_frame)

    if cv2.waitKey(1) & 0xFF == 27:  # ESC для выхода
        break

cap.release()
cv2.destroyAllWindows()



0: 384x640 12 persons, 76.4ms
Speed: 2.5ms preprocess, 76.4ms inference, 2.4ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 12 persons, 54.0ms
Speed: 1.7ms preprocess, 54.0ms inference, 2.1ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 12 persons, 60.5ms
Speed: 2.8ms preprocess, 60.5ms inference, 2.4ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 12 persons, 59.2ms
Speed: 1.3ms preprocess, 59.2ms inference, 2.5ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 13 persons, 54.5ms
Speed: 1.8ms preprocess, 54.5ms inference, 1.9ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 14 persons, 50.4ms
Speed: 1.5ms preprocess, 50.4ms inference, 1.8ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 14 persons, 49.9ms
Speed: 1.6ms preprocess, 49.9ms inference, 2.5ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 13 persons, 48.7ms
Speed: 1.4ms preprocess, 48.7ms inference, 1.7ms postprocess per image at

KeyboardInterrupt: 

KeyboardInterrupt: 

In [6]:
import cv2
import numpy as np
from ultralytics import YOLO
from collections import defaultdict
from scipy.signal import find_peaks

model = YOLO("yolov8s-pose.pt")

history = defaultdict(lambda: [])
max_hist = 60  # 2 секунды при 30 fps

cap = cv2.VideoCapture("vid_2.mp4")
fps = cap.get(cv2.CAP_PROP_FPS) or 30

while True:
    ret, frame = cap.read()
    if not ret: break
    annotated = frame.copy()

    results = model.track(
        frame,
        persist=True,
        tracker="bytetrack.yaml",
        classes=[0],
        conf=0.3,
        iou=0.5,
        verbose=False
    )[0]

    if results.boxes.id is not None:
        for box, track_id, kpts_xy, kpts_conf in zip(
            results.boxes.xyxy,
            results.boxes.id,
            results.keypoints.xy.cpu().numpy(),
            results.keypoints.conf.cpu().numpy()
        ):
            track_id = int(track_id)
            x1, y1, x2, y2 = map(int, box)

            lk = kpts_xy[13]
            rk = kpts_xy[14]
            conf_l, conf_r = kpts_conf[13], kpts_conf[14]

            if conf_l > 0.5 and conf_r > 0.5 and lk[0] > 0 and rk[0] > 0:
                lk_pt = (int(lk[0]), int(lk[1]))
                rk_pt = (int(rk[0]), int(rk[1]))

                cv2.circle(annotated, lk_pt, 10, (255, 0, 255), -1)
                cv2.circle(annotated, rk_pt, 10, (255, 0, 255), -1)

                avg_y = (lk[1] + rk[1]) / 2
                history[track_id].append(avg_y)
                if len(history[track_id]) > max_hist:
                    history[track_id].pop(0)

                if len(history[track_id]) > 15:
                    y_vals = np.array(history[track_id])
                    peaks, _ = find_peaks(y_vals, distance=int(fps*0.4), prominence=10)
                    steps_per_min = len(peaks) / (len(y_vals)/fps) * 60
                    step_text = f"{int(steps_per_min)} steps/min"
                else:
                    step_text = "—"

                cv2.putText(annotated, step_text, (x1, y1-10),
                            cv2.FONT_HERSHEY_SIMPLEX, 0.8, (0, 255, 255), 2)

            cv2.rectangle(annotated, (x1, y1), (x2, y2), (0, 255, 0), 2)
            cv2.putText(annotated, f"ID:{track_id}", (x1, y1-30),
                        cv2.FONT_HERSHEY_SIMPLEX, 0.7, (0, 255, 0), 2)

    cv2.imshow("YOLOv8-pose + ByteTrack", annotated)
    if cv2.waitKey(1) == 27:  # ESC
        break

cap.release()
cv2.destroyAllWindows()

ERROR! Session/line number was not unique in database. History logging moved to new session 186



KeyboardInterrupt

