# Paquetes necesarios

In [1]:
import cv2  
import math 
from ultralytics import YOLO

Modelos preentrenados, visualizando con las utilidades de ultralytics

In [None]:
# Load the model (uncomment exactly one of the alternatives)
#model = YOLO('yolo11n.pt') # Bounding boxes
#model = YOLO('yolo11n-seg.pt') # Masks
model = YOLO('yolo11n-pose.pt')  # Pose

# Run on a video
filename = "TGC23_PdH_C0056cut.mp4"
# stream=True makes the call return a generator, so the per-frame results are
# not all accumulated in memory while the video is processed and displayed.
results = model(filename, show=True, stream=True)

try:
    # The generator has to be consumed for the frames to be processed and shown.
    for _ in results:
        pass
finally:
    # Close the preview window even if processing is interrupted.
    cv2.destroyAllWindows()

Desde cámara, detección con yolo11, modelo nano. Visualización propia con OpenCV

In [3]:
# Load the model; the weights are downloaded to disk if not already present in the folder
model = YOLO('yolo11n.pt')  # Bounding boxes

# Class labels, indexed by the numeric class id reported for each detection
classNames = ["person", "bicycle", "car", "motorbike", "aeroplane", "bus", "train", "truck", "boat",
              "traffic light", "fire hydrant", "stop sign", "parking meter", "bench", "bird", "cat",
              "dog", "horse", "sheep", "cow", "elephant", "bear", "zebra", "giraffe", "backpack", "umbrella",
              "handbag", "tie", "suitcase", "frisbee", "skis", "snowboard", "sports ball", "kite", "baseball bat",
              "baseball glove", "skateboard", "surfboard", "tennis racket", "bottle", "wine glass", "cup",
              "fork", "knife", "spoon", "bowl", "banana", "apple", "sandwich", "orange", "broccoli",
              "carrot", "hot dog", "pizza", "donut", "cake", "chair", "sofa", "pottedplant", "bed",
              "diningtable", "toilet", "tvmonitor", "laptop", "mouse", "remote", "keyboard", "cell phone",
              "microwave", "oven", "toaster", "sink", "refrigerator", "book", "clock", "vase", "scissors",
              "teddy bear", "hair drier", "toothbrush"
              ]


# Capture from the webcam
vid = cv2.VideoCapture(0)
if not vid.isOpened():
    print("No se pudo abrir la cámara")

try:
    # Without an opened camera no frame would ever arrive and no window would be
    # created, so the ESC check below could never end the loop.
    while vid.isOpened():
        # Frame by frame
        ret, img = vid.read()

        # Only process valid frames
        if ret:
            # Run detection on the frame
            results = model(img, stream=True)

            # For each result
            for r in results:
                for box in r.boxes:
                    # Bounding box corners, converted to integer pixel coordinates
                    x1, y1, x2, y2 = map(int, box.xyxy[0])

                    # Confidence, rounded up to two decimals
                    confidence = math.ceil((box.conf[0]*100))/100
                    print("Confianza --->",confidence)

                    # Class
                    cls = int(box.cls[0])
                    print("Clase -->", classNames[cls])

                    # Map the numeric class id onto a black -> red -> yellow -> white
                    # ramp: each channel saturates at 255 before the next one starts.
                    escala = int((cls / len(classNames)) * 255 * 3)
                    R, G, B = (min(max(escala - 255 * k, 0), 255) for k in range(3))

                    # Draw the box and the class label. OpenCV expects colours in
                    # BGR order, so the RGB components are passed reversed.
                    cv2.rectangle(img, (x1, y1), (x2, y2), (B, G, R), 3)
                    cv2.putText(img, classNames[cls], (x1, y1), cv2.FONT_HERSHEY_SIMPLEX, 1, (B, 0, 255), 2)

            # Show the frame
            cv2.imshow('Vid', img)

        # Stop when ESC is pressed
        if cv2.waitKey(20) == 27:
            break
finally:
    # Always release the capture device and close the windows, even if the loop
    # raised or the kernel was interrupted.
    vid.release()
    cv2.destroyAllWindows()

[KDownloading https://github.com/ultralytics/assets/releases/download/v8.3.0/yolo11n.pt to 'yolo11n.pt': 100% ━━━━━━━━━━━━ 5.4MB 16.4MB/s 0.3s.2s<0.3s4s

0: 480x640 1 person, 105.7ms
Confianza ---> 0.38
Clase --> person
Speed: 2.3ms preprocess, 105.7ms inference, 5.2ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 1 person, 25.7ms
Confianza ---> 0.82
Clase --> person
Speed: 3.0ms preprocess, 25.7ms inference, 3.3ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 2 persons, 24.2ms
Confianza ---> 0.35
Clase --> person
Confianza ---> 0.29
Clase --> person
Speed: 3.4ms preprocess, 24.2ms inference, 2.8ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 1 person, 23.4ms
Confianza ---> 0.73
Clase --> person
Speed: 3.1ms preprocess, 23.4ms inference, 2.7ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 2 persons, 25.6ms
Confianza ---> 0.42
Clase --> person
Confianza ---> 0.26
Clase --> person
Speed: 2.7ms preprocess, 25.6ms inference, 2.9ms p

Seguimiento. Requiere instalar lap con pip install lap

In [4]:
from collections import defaultdict
import numpy as np

# Load the model; the weights are downloaded to disk if not already present in the folder
model = YOLO('yolo11n.pt')  # Bounding boxes

# Labels of the classes to track; the position in the list is the numeric class id
classNames = ["person", "bicycle", "car"]

# Optional visualisation: draw the recent path of every tracked object.
# Disabled by default so that only boxes and labels are drawn.
DRAW_TRAILS = False
TRAIL_LENGTH = 30  # number of most recent positions kept per track

# Capture from the webcam
vid = cv2.VideoCapture(0)
track_history = defaultdict(list)
if not vid.isOpened():
    print("No se pudo abrir la cámara")

try:
    # Without an opened camera no frame would ever arrive and no window would be
    # created, so the ESC check below could never end the loop.
    while vid.isOpened():
        # Frame by frame
        ret, img = vid.read()

        # Only process valid frames
        if ret:
            # Tracking, with persistence between frames. The class filter is derived
            # from classNames so every reported class id has a label.
            results = model.track(img, persist=True, classes=list(range(len(classNames))))

            # For each result
            for r in results:
                for box in r.boxes:
                    # Bounding box corners, converted to integer pixel coordinates
                    x1, y1, x2, y2 = map(int, box.xyxy[0])

                    # Tracking label; the id is None when the box has no track assigned
                    if box.id is not None:
                        track_id = str(int(box.id[0].tolist()))
                    else:
                        track_id = ''

                    # Confidence, rounded up to two decimals
                    confidence = math.ceil((box.conf[0]*100))/100
                    print("Confianza --->",confidence)

                    # Class
                    cls = int(box.cls[0])
                    print("Clase -->", classNames[cls])

                    # Map the numeric class id onto a black -> red -> yellow -> white
                    # ramp: each channel saturates at 255 before the next one starts.
                    escala = int((cls / len(classNames)) * 255 * 3)
                    R, G, B = (min(max(escala - 255 * k, 0), 255) for k in range(3))

                    # Draw the box and the label. OpenCV expects colours in BGR
                    # order, so the RGB components are passed reversed.
                    cv2.rectangle(img, (x1, y1), (x2, y2), (B, G, R), 3)
                    cv2.putText(img, track_id + ' ' + classNames[cls], (x1, y1), cv2.FONT_HERSHEY_SIMPLEX, 1, (B, 0, 255), 2)

                    # Optionally draw the path followed by the centre of the box
                    if DRAW_TRAILS and box.id is not None:
                        trail = track_history[track_id]
                        trail.append(((x1 + x2) // 2, (y1 + y2) // 2))
                        del trail[:-TRAIL_LENGTH]  # keep only the most recent positions
                        points = np.array(trail, dtype=np.int32).reshape((-1, 1, 2))
                        cv2.polylines(img, [points], isClosed=False, color=(230, 230, 230), thickness=10)

            # Show the frame
            cv2.imshow('Vid', img)

        # Stop when ESC is pressed
        if cv2.waitKey(20) == 27:
            break
finally:
    # Always release the capture device and close the windows, even if the loop
    # raised or the kernel was interrupted.
    vid.release()
    cv2.destroyAllWindows()




0: 480x640 1 person, 28.4ms
Speed: 2.8ms preprocess, 28.4ms inference, 28.6ms postprocess per image at shape (1, 3, 480, 640)
Confianza ---> 0.59
Clase --> person

0: 480x640 1 person, 27.0ms
Speed: 2.9ms preprocess, 27.0ms inference, 3.7ms postprocess per image at shape (1, 3, 480, 640)
Confianza ---> 0.44
Clase --> person

0: 480x640 1 person, 29.4ms
Speed: 3.0ms preprocess, 29.4ms inference, 4.8ms postprocess per image at shape (1, 3, 480, 640)
Confianza ---> 0.42
Clase --> person

0: 480x640 1 person, 27.3ms
Speed: 3.2ms preprocess, 27.3ms inference, 3.6ms postprocess per image at shape (1, 3, 480, 640)
Confianza ---> 0.58
Clase --> person

0: 480x640 2 persons, 27.0ms
Speed: 3.4ms preprocess, 27.0ms inference, 3.8ms postprocess per image at shape (1, 3, 480, 640)
Confianza ---> 0.44
Clase --> person
Confianza ---> 0.4
Clase --> person

0: 480x640 2 persons, 27.1ms
Speed: 2.9ms preprocess, 27.1ms inference, 3.6ms postprocess per image at shape (1, 3, 480, 640)
Confianza ---> 0.4
C

Integración con seguimiento (tracking)
**Nota:** he tenido que bajar a la versión 3.9.5 de Python e instalar lap con `pip install lap`.

In [None]:
# Load the model (uncomment exactly one of the alternatives)
model = YOLO('yolo11n.pt')  # Bounding boxes
#model = YOLO('yolo11n-seg.pt') # Masks
#model = YOLO('yolo11n-pose.pt')  # Pose

# Run on a video
filename = "TGC23_PdH_C0056cut.mp4"
# stream=True makes the call return a generator, so the per-frame results are
# not all accumulated in memory while the video is tracked and displayed.
results = model.track(source=filename, show=True, stream=True)  # BoT-SORT tracker (default)
#results = model.track(source=filename, show=True, stream=True, tracker="bytetrack.yaml")  # ByteTrack tracker

try:
    # The generator has to be consumed for the frames to be processed and shown.
    for _ in results:
        pass
finally:
    # Close the preview window even if processing is interrupted.
    cv2.destroyAllWindows()