In [2]:
# All imports
import cv2
from PIL import Image
import numpy as np
import torch

In [3]:
def recordVideo(output):
    """Record the default camera (device 0) to ``output`` until 'q' is pressed.

    The frames written to the file are the raw camera frames; the preview
    window shows each frame mirrored horizontally.

    Args:
        output: Path of the video file to create (written with the 'mp4v'
            codec at a nominal 30 fps, using the frame size reported by the
            camera).
    """
    cap = cv2.VideoCapture(0)
    # Define the codec and create a VideoWriter object matching the camera size
    size = (
        int(cap.get(cv2.CAP_PROP_FRAME_WIDTH)),
        int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT)),
    )
    out = cv2.VideoWriter(output, cv2.VideoWriter_fourcc(*'mp4v'), 30, size)
    try:
        while True:
            ret, frame = cap.read()
            if not ret:
                # Camera unavailable or stream ended
                break

            # Write the (unmirrored) frame to the output file
            out.write(frame)

            # Display a mirrored copy so the preview behaves like a mirror
            cv2.imshow('frame', cv2.flip(frame, 1))

            # Exit recording if 'q' is pressed
            if cv2.waitKey(1) & 0xFF == ord('q'):
                break
    finally:
        # Always release the camera and finalize the file, even if the loop
        # raises or the kernel is interrupted; otherwise the device stays
        # locked for the rest of the kernel session.
        cap.release()
        out.release()
        cv2.destroyAllWindows()
        # NOTE(review): presumably lets the GUI event loop process the window
        # close on platforms where destroyAllWindows alone is not enough.
        cv2.waitKey(1)


In [4]:
def _draw_strokes(image, finger_positions):
    """Draw every recorded stroke onto ``image`` as an open polyline.

    ``finger_positions`` is a flat list of ``(x, y)`` tuples in which the
    string ``"stop"`` separates consecutive strokes. A trailing stroke that is
    not yet terminated by ``"stop"`` is drawn as well. The list is not
    modified.
    """
    def _polyline(points):
        # cv2.polylines requires 32-bit integer points; a plain np.array of
        # Python ints is int64 on most platforms and is rejected.
        pts = np.array(points, dtype=np.int32)
        cv2.polylines(image, [pts], False, (255, 0, 0), 3)

    stroke = []
    for position in finger_positions:
        if position != "stop":
            stroke.append(position)
        elif stroke:
            _polyline(stroke)
            stroke = []
    if stroke:
        _polyline(stroke)


def video_tracking(original_video_path, depth_video_path,
                   depth_threshold=220, min_bright_pixels=115000,
                   max_missed_frames=5):
    """Track the index fingertip over a video and draw the path it traces.

    The original video and its depth video are read frame by frame in
    lockstep and mirrored horizontally. MediaPipe Hands detects hand
    landmarks on the colour frame. The index-fingertip position is recorded
    only when the number of depth-frame values above ``depth_threshold``
    exceeds ``min_bright_pixels`` (presumably meaning the hand is close to the
    camera -- confirm against how the depth video is encoded). Recorded
    positions are drawn as polylines, shown in an "Output" window and written
    to ``tracking.mp4``. Press Esc to stop early.

    Args:
        original_video_path: Path of the colour video to analyse.
        depth_video_path: Path of the matching depth video. The result is
            saved as ``<first backslash-separated component>\\tracking.mp4``
            of this path.
        depth_threshold: Depth-frame values strictly above this count as
            "bright".
        min_bright_pixels: Minimum count of bright values (summed over all
            channels) for a fingertip position to be recorded.
        max_missed_frames: Number of consecutive detected-hand observations
            without a recorded position after which the current stroke ends.
    """
    # No visible notebook cell imports `mp`; import it here so the function
    # works on a fresh kernel instead of relying on hidden kernel state.
    import mediapipe as mp

    INDEX_FINGER_TIP = 8  # landmark id of the tip of the index finger

    savePath = depth_video_path.split('\\')[0] + "\\tracking.mp4"

    # Set up video capture for both input videos
    cap = cv2.VideoCapture(original_video_path)
    depth = cv2.VideoCapture(depth_video_path)

    # Size the writer from the input video: a writer whose size differs from
    # the frames it receives does not produce a usable file. Fall back to the
    # previous hard-coded size if the capture reports nothing.
    size = (
        int(cap.get(cv2.CAP_PROP_FRAME_WIDTH)),
        int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT)),
    )
    if 0 in size:
        size = (640, 480)
    out = cv2.VideoWriter(savePath, cv2.VideoWriter_fourcc(*'mp4v'), 30, size)

    # Set up MediaPipe hand detection
    mpHands = mp.solutions.hands
    mpDraw = mp.solutions.drawing_utils

    # (x, y) fingertip positions, with "stop" separating strokes
    finger_positions = []
    # Consecutive hand observations for which no position was recorded
    missed = 0

    try:
        with mpHands.Hands() as hands:
            while cap.isOpened() and depth.isOpened():
                success, image = cap.read()
                if not success:
                    break
                success_depth, image_depth = depth.read()
                if not success_depth:
                    break

                image = cv2.flip(image, 1)
                image_depth = cv2.flip(image_depth, 1)

                bright_pixels = int(np.count_nonzero(image_depth > depth_threshold))
                depth_ok = bright_pixels > min_bright_pixels

                # MediaPipe expects RGB, OpenCV delivers BGR
                results = hands.process(cv2.cvtColor(image, cv2.COLOR_BGR2RGB))

                if results.multi_hand_landmarks:
                    h, w = image.shape[:2]
                    for handLms in results.multi_hand_landmarks:
                        landmarks = handLms.landmark
                        if depth_ok and len(landmarks) > INDEX_FINGER_TIP:
                            tip = landmarks[INDEX_FINGER_TIP]
                            # Landmarks are normalised; convert to pixels
                            cx, cy = int(tip.x * w), int(tip.y * h)
                            finger_positions.append((cx, cy))
                            cv2.circle(image, (cx, cy), 10,
                                       (255, 0, 255), cv2.FILLED)
                            missed = 0
                        else:
                            missed += 1

                        if missed >= max_missed_frames:
                            # Too long without a valid position: end the stroke
                            finger_positions.append("stop")
                            missed = 0

                        # Draw the landmarks and connections on the image
                        mpDraw.draw_landmarks(
                            image, handLms, mpHands.HAND_CONNECTIONS)

                # Draw a polyline through every stroke recorded so far
                _draw_strokes(image, finger_positions)

                # Display the annotated frame and stream it to the output
                # file instead of keeping every frame in memory.
                cv2.imshow("Output", image)
                out.write(image)

                # Check for the Esc key to stop the program
                if cv2.waitKey(5) & 0xFF == 27:
                    break

        print(f"video save at: {savePath}")
    finally:
        # Release every resource even if processing fails midway
        out.release()
        cap.release()
        depth.release()
        cv2.destroyAllWindows()


In [1]:
import torch

In [13]:
def midas_prediction(frame):
    """Run the MiDaS model on one BGR frame and return its prediction.

    Relies on the notebook-level globals ``transform``, ``device`` and
    ``midas`` being defined before the call.

    Args:
        frame: Image as read by OpenCV (BGR channel order).

    Returns:
        The model prediction as a NumPy array, resized with bicubic
        interpolation to the height and width of ``frame``.
    """
    rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
    target_hw = rgb.shape[:2]
    batch = transform(rgb).to(device)

    # Inference only: no gradients needed
    with torch.no_grad():
        raw = midas(batch)
        # interpolate needs a channel dimension, hence the unsqueeze
        resized = torch.nn.functional.interpolate(
            raw.unsqueeze(1),
            size=target_hw,
            mode="bicubic",
            align_corners=False,
        )
        depth_map = resized.squeeze()

    return depth_map.cpu().numpy()


In [15]:
def midas_depthVideo(video_path):
    """Create a depth video from ``video_path`` using ``midas_prediction``.

    Every frame of the input is passed through ``midas_prediction``; the
    result is rescaled so its maximum maps to 255, converted to 8-bit and
    written as a 3-channel frame to
    ``<first backslash-separated component of video_path>\\depth_video.mp4``
    ('mp4v' codec, nominal 30 fps, same frame size as the input).

    Args:
        video_path: Path of the input video.

    Returns:
        The path of the depth video that was written.
    """
    cap = cv2.VideoCapture(video_path)
    depth_path = video_path.split('\\')[0] + "\\depth_video.mp4"
    fourcc = cv2.VideoWriter_fourcc(*'mp4v')
    size = (
        int(cap.get(cv2.CAP_PROP_FRAME_WIDTH)),
        int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT)),
    )
    video = cv2.VideoWriter(depth_path, fourcc, 30, size)
    try:
        # Read until the stream ends rather than trusting the reported frame
        # count, which some containers report as 0 or inaccurately.
        while True:
            is_read, frame = cap.read()
            if not is_read:
                break

            output = midas_prediction(frame)
            peak = np.max(output)
            if peak > 0:
                # Clip so stray negative values do not wrap around in uint8
                formatted = np.clip(output * 255 / peak, 0, 255).astype("uint8")
            else:
                # Avoid dividing by zero on an all-zero prediction
                formatted = np.zeros(output.shape, dtype="uint8")

            # The writer expects 3-channel frames; replicate the single
            # channel directly instead of a lossy round trip through a JPEG.
            video.write(cv2.cvtColor(formatted, cv2.COLOR_GRAY2BGR))
    finally:
        cap.release()
        video.release()
    print("La vidéo a été créée avec succès !")
    return depth_path


In [71]:
# main
# Input video; the backslash is escaped so the literal does not rely on an
# invalid escape sequence ('\i').
output = 'video\\init.mp4'
# Uncomment to record a new input video from the webcam first:
#recordVideo(output)

# Model selection
model_type = "DPT_Large"      # MiDaS v3 - Large     (highest accuracy, slowest inference speed)
#model_type = "DPT_Hybrid"     # MiDaS v3 - Hybrid    (medium accuracy, medium inference speed)
#model_type = "MiDaS_small"      # MiDaS v2.1 - Small   (lowest accuracy, highest inference speed)

# Model loading
midas = torch.hub.load("intel-isl/MiDaS", model_type)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
midas.to(device)
midas.eval()

# Pick the input transform that matches the selected model family
midas_transforms = torch.hub.load("intel-isl/MiDaS", "transforms")
if model_type in ("DPT_Large", "DPT_Hybrid"):
    transform = midas_transforms.dpt_transform
else:
    transform = midas_transforms.small_transform

# Depth video creation
video_path = midas_depthVideo(output)
if video_path is None:
    # midas_depthVideo may not return the path it wrote to; rebuild it the
    # same way that function does so tracking gets a usable path.
    video_path = output.split('\\')[0] + "\\depth_video.mp4"

# Fingertip tracking on the original video, gated by the depth video
video_tracking(output, video_path)


Using cache found in /Users/quentin/.cache/torch/hub/intel-isl_MiDaS_master
Using cache found in /Users/quentin/.cache/torch/hub/intel-isl_MiDaS_master


La vidéo a été créée avec succès !
