### Commands
Run YOLOv5's `detect.py` on the default webcam (`--source 0`):

`python detect.py --weights ../yolov5m6.pt --img 1024 --conf 0.25 --source 0`

## References
https://github.com/ultralytics/yolov5/issues/36

### Model Initialization

In [1]:
import ctypes
from ctypes import windll
from IPython.display import clear_output, display
import time
from win32 import win32api
import win32con

import cv2
import matplotlib.pyplot as plt
import mss
import numpy as np
from PIL import Image, ImageGrab
import pandas as pd
import pyautogui
import random
import torch
import torchvision

In [2]:
# Report the GPU PyTorch will use (when CUDA is available), then the
# installed PyTorch version.
if not torch.cuda.is_available():
    print(torch.__version__)
else:
    print(torch.cuda.get_device_name())
    print(torch.__version__)

NVIDIA GeForce RTX 3070 Laptop GPU
1.11.0+cu113


In [3]:
# Read the class names, one per line. The position of a name in `labels` is
# used later as the class index when looking up detections.
# The `with` block closes the file as soon as it has been read.
with open("labels.txt", "r") as file:
    content = file.read()
labels = content.split("\n")

In [4]:
# --- Bounding boxes ---------------------------------------------------------
# Alternative multi-colour palette, kept for reference:
# BBOX_COLORS = [(255,97,3), (255,64,64), (127,255,212), (227,207,87), (118,238,0)]
color = (255, 153, 51)   # single colour used for boxes and label backgrounds
BBOX_THICKNESS = 2       # line thickness passed to cv2.rectangle

# --- Label text -------------------------------------------------------------
FONT_FAMILY, TEXT_SIZE, TEXT_THICKNESS = cv2.FONT_HERSHEY_DUPLEX, 0.8, 2
TEXT_COLOR = (255, 255, 255)

In [5]:
# Load the YOLOv5m6 model from the ultralytics/yolov5 repository via PyTorch Hub.
model = torch.hub.load(repo_or_dir='ultralytics/yolov5', model='yolov5m6')

Using cache found in C:\Users\alexc/.cache\torch\hub\ultralytics_yolov5_master
YOLOv5  2022-6-26 Python-3.8.13 torch-1.11.0+cu113 CUDA:0 (NVIDIA GeForce RTX 3070 Laptop GPU, 8192MiB)

Fusing layers... 
YOLOv5m6 summary: 378 layers, 35704908 parameters, 0 gradients
Adding AutoShape... 


In [6]:
# Inference / NMS options, keyed by the model attribute they are written to.
_INFERENCE_OPTIONS = {
    "conf": 0.25,          # NMS confidence threshold
    "iou": 0.45,           # NMS IoU threshold
    "agnostic": False,     # NMS class-agnostic
    "multi_label": False,  # NMS multiple labels per box
    "classes": None,       # (optional list) filter by class, i.e. = [0, 15, 16] for COCO persons, cats and dogs
    "max_det": 1000,       # maximum number of detections per image
    "amp": False,          # Automatic Mixed Precision (AMP) inference
}

# Apply every option to the model in the order listed above.
for _option_name, _option_value in _INFERENCE_OPTIONS.items():
    setattr(model, _option_name, _option_value)

### Using file image

In [55]:
# Open the sample image with PIL.
img = Image.open("bus.jpg")
# Alternative: read it with OpenCV and reverse the channel axis, because
# OpenCV returns BGR.
# img = cv2.imread("bus.jpg")[..., ::-1]  # OpenCV image (BGR to RGB)

In [56]:
# Run the model on the sample image with size=640, then call show() on the
# returned results object.
results = model(img, size=640)
results.show()

### Using camera

In [None]:
# Live detection on camera device 0. Every detection is drawn on the frame
# with a labelled box; press "q" in the preview window to stop.
vid = cv2.VideoCapture(0)

try:
    while True:
        # Camera capture
        ret, frame = vid.read()
        if not ret:
            # read() delivered no frame; passing None to the model would crash.
            print("Could not read a frame from the camera; stopping.")
            break

        # Inference. OpenCV frames are BGR, so convert to RGB for the model
        # (the same conversion the file-image example applies to cv2.imread).
        results = model(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))
        for result in results.xyxy[0]:
            # Each row is unpacked as (xmin, ymin, xmax, ymax, confidence, class index).
            np_result = result.cpu().detach().numpy()
            xmin, ymin, xmax, ymax, confidence, class_name = np_result
            xmin, ymin, xmax, ymax, class_name = [
                int(value) for value in (xmin, ymin, xmax, ymax, class_name)
            ]

            # The annotations go on the original BGR frame that is displayed.
            cv2.rectangle(frame, (xmin, ymin), (xmax, ymax), color, BBOX_THICKNESS)

            label = labels[class_name]
            confidence = str(confidence.round(2))
            text = f"{label}: {confidence}"

            # Measure with the same thickness used for drawing so the filled
            # background matches the rendered text.
            (w, h), _ = cv2.getTextSize(text, FONT_FAMILY, TEXT_SIZE, TEXT_THICKNESS)
            cv2.rectangle(frame, (xmin, ymin - h), (xmin + w, ymin), color, -1)
            cv2.putText(frame, text, (xmin, ymin), FONT_FAMILY, TEXT_SIZE, TEXT_COLOR, TEXT_THICKNESS)

        cv2.imshow('frame', frame)

        if cv2.waitKey(1) & 0xFF == ord('q'):
            break
finally:
    # Always free the camera and close the preview window, including when the
    # loop is interrupted or raises.
    vid.release()
    cv2.destroyAllWindows()

### Using computer screen

In [7]:
# Print the geometry of every monitor entry that mss reports.
with mss.mss() as screen_capture:
    monitor_entries = screen_capture.monitors
    for monitor_geometry in monitor_entries:
        print(monitor_geometry)

{'left': -2880, 'top': 0, 'width': 5440, 'height': 1620}
{'left': 0, 'top': 0, 'width': 2560, 'height': 1600}
{'left': -2880, 'top': 0, 'width': 2880, 'height': 1620}


In [9]:
# Flag bits for the Win32 mouse_event call, written as single-bit shifts.
MOUSEEVENTF_MOVE = 1 << 0         # 0x0001 - mouse move
MOUSEEVENTF_LEFTDOWN = 1 << 1     # 0x0002 - left button down
MOUSEEVENTF_LEFTUP = 1 << 2       # 0x0004 - left button up
MOUSEEVENTF_RIGHTDOWN = 1 << 3    # 0x0008 - right button down
MOUSEEVENTF_RIGHTUP = 1 << 4      # 0x0010 - right button up
MOUSEEVENTF_MIDDLEDOWN = 1 << 5   # 0x0020 - middle button down
MOUSEEVENTF_MIDDLEUP = 1 << 6     # 0x0040 - middle button up
MOUSEEVENTF_WHEEL = 1 << 11       # 0x0800 - wheel button rolled
MOUSEEVENTF_ABSOLUTE = 1 << 15    # 0x8000 - absolute move
    
# Moves the mouse cursor to position (final_x, final_y) and presses the left button
def move_mouse(final_x, final_y):
    """Move the cursor to a pixel on the primary display and press the left button.

    Args:
        final_x: Target x coordinate in pixels, measured from the left edge
            of the primary display.
        final_y: Target y coordinate in pixels, measured from the top edge
            of the primary display.

    With MOUSEEVENTF_ABSOLUTE, mouse_event expects coordinates normalised to
    the range 0..65535 across the primary display rather than raw pixels, so
    the pixel target is rescaled before the call. Targets that fall outside
    the primary display are clamped to its nearest edge.

    The left button is pressed with ``pyautogui.mouseDown`` and is not
    released by this function.
    """
    curr_x, curr_y = pyautogui.position()
    print(f"Current position {(curr_x, curr_y)}")

    user32 = windll.user32
    # GetSystemMetrics(0) / (1) are SM_CXSCREEN / SM_CYSCREEN: the width and
    # height of the primary display in pixels.
    screen_w = user32.GetSystemMetrics(0)
    screen_h = user32.GetSystemMetrics(1)

    def _normalise(pixel, extent):
        # Map 0..extent-1 pixels onto 0..65535 and keep the result in range.
        scaled = round(pixel * 65535 / max(extent - 1, 1))
        return min(max(scaled, 0), 65535)

    user32.mouse_event(
        MOUSEEVENTF_MOVE | MOUSEEVENTF_ABSOLUTE,
        _normalise(final_x, screen_w),
        _normalise(final_y, screen_h),
        0,
        0,
    )
    pyautogui.mouseDown(button='left')
    
def get_mouse_position():
    """Print the current cursor position every 0.5 seconds.

    Each update clears the previous notebook output first. The loop runs
    until a KeyboardInterrupt is raised, which ends it quietly.
    """
    try:
        while True:
            time.sleep(0.5)
            clear_output(wait=True)
            print(pyautogui.position())
    except KeyboardInterrupt:
        # Interrupt is the intended way out; nothing to clean up.
        return

In [15]:
# Detection on a live screen capture: grab the selected monitor, detect
# people, and drive the mouse to each detected person. Press "q" in the
# preview window to stop.
sct = mss.mss()

MONITOR = {
    "COMPUTER": sct.monitors[1],
    "EXTERNAL": sct.monitors[2]
}

# Screen position that is clicked once to give the game window focus.
START_POS = {
    "COMPUTER": (300, 300),
    "EXTERNAL": (-300, 200)
}

# window where the game is being played on (COMPUTER/EXTERNAL)
SOURCE_WINDOW = "COMPUTER"

# Tab into the game computer window
pyautogui.moveTo(START_POS[SOURCE_WINDOW])
pyautogui.click()

capture_region = MONITOR[SOURCE_WINDOW]

try:
    while True:
        screenShot = sct.grab(capture_region)
        frame = Image.frombytes(
            'RGB',
            (screenShot.width, screenShot.height),
            screenShot.rgb,
        )
        frame = np.array(frame)

        # Inference
        results = model(frame)
        for result in results.xyxy[0]:
            # Each row is unpacked as (xmin, ymin, xmax, ymax, confidence, class index).
            np_result = result.cpu().detach().numpy()
            xmin, ymin, xmax, ymax, confidence, class_name = np_result
            xmin, ymin, xmax, ymax, class_name = [
                int(value) for value in (xmin, ymin, xmax, ymax, class_name)
            ]

            if labels[class_name] != "person":
                continue

            # Calculate the final position where the cursor should be
            # The x coordinate should be the middle
            final_x = int((xmin + xmax) / 2)
            # The y coordinate should be a little higher than the centre
            # (30% from the top rather than the 50% middle)
            final_y = int(ymin + (ymax - ymin) * 0.3)

            # Clear output for clean printing to the notebook
            clear_output(wait=True)
            cv2.circle(frame, (final_x, final_y), 3, (0, 255, 0), -1)

            # Detections are relative to the captured region; add the region's
            # origin to get the position on the screen itself.
            screen_x = capture_region["left"] + final_x
            screen_y = capture_region["top"] + final_y
            print(f"Target position: {(screen_x, screen_y)}")

            move_mouse(screen_x, screen_y)

            cv2.rectangle(frame, (xmin, ymin), (xmax, ymax), color, BBOX_THICKNESS)

            label = labels[class_name]
            confidence = str(confidence.round(2))
            text = f"{label}: {confidence}"

            # Measure with the same thickness used for drawing so the filled
            # background matches the rendered text.
            (w, h), _ = cv2.getTextSize(text, FONT_FAMILY, TEXT_SIZE, TEXT_THICKNESS)
            cv2.rectangle(frame, (xmin, ymin - h), (xmin + w, ymin), color, -1)
            cv2.putText(frame, text, (xmin, ymin), FONT_FAMILY, TEXT_SIZE, TEXT_COLOR, TEXT_THICKNESS)

        # The frame is RGB; OpenCV's imshow expects BGR.
        cv2.imshow('frame', cv2.cvtColor(frame, cv2.COLOR_RGB2BGR))

        if cv2.waitKey(1) & 0xFF == ord('q'):
            break
finally:
    # Close the preview window even when the loop is interrupted or raises.
    cv2.destroyAllWindows()

Target position: (1400, 1001)
Current position (1280, 800)
