Real time prediction from YOLO11

In [None]:
# Finalized CNN with better accuracy

import torch
import torch.nn as nn
import torchvision.transforms as transforms
import cv2
import numpy as np

# Device
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Class labels (update to match your dataset folder order)
class_labels = ['Angry', 'Boring', 'Disgust', 'Fear', 'Happy', 'Neural', 'Sad', 'Stress', 'Suprise']

# Model definition (must match training)
class EmotionCNN(nn.Module):
    def __init__(self, num_classes):
        super(EmotionCNN, self).__init__()
        self.features = nn.Sequential(
            nn.Conv2d(1, 32, 3, padding=1), nn.ReLU(),
            nn.BatchNorm2d(32),
            nn.Conv2d(32, 64, 3, padding=1), nn.ReLU(),
            nn.BatchNorm2d(64),
            nn.MaxPool2d(2), nn.Dropout(0.25),

            nn.Conv2d(64, 128, 3, padding=1), nn.ReLU(),
            nn.BatchNorm2d(128),
            nn.MaxPool2d(2), nn.Dropout(0.25)
        )
        self.classifier = nn.Sequential(
            nn.Flatten(),
            nn.Linear(128 * 16 * 16, 256), nn.ReLU(),
            nn.Dropout(0.5),
            nn.Linear(256, num_classes)
        )

    def forward(self, x):
        x = self.features(x)
        x = self.classifier(x)
        return x

# Load model
model = EmotionCNN(len(class_labels)).to(device)
model.load_state_dict(torch.load("Models/emotion_model.pth", map_location=device))
model.eval()

# Preprocessing for webcam frames
transform = transforms.Compose([
    transforms.ToTensor(),
    transforms.Normalize([0.5], [0.5])
])

# Load Haar Cascade for face detection
face_cascade = cv2.CascadeClassifier(cv2.data.haarcascades + 'haarcascade_frontalface_default.xml')

cap = cv2.VideoCapture(0)
print("Press 'q' to quit.")
import time

last_prediction_time = 0
prediction_interval = 2  # seconds
last_labels = []

while True:
    ret, frame = cap.read()
    if not ret:
        break
    gray = cv2.cvtColor(frame, cv2.COLOR_BGR2GRAY)
    faces = face_cascade.detectMultiScale(gray, scaleFactor=1.1, minNeighbors=5)
    current_time = time.time()
    if current_time - last_prediction_time >= prediction_interval:
        last_labels = []
        for (x, y, w, h) in faces:
            face = gray[y:y+h, x:x+w]
            face_resized = cv2.resize(face, (64, 64))
            face_tensor = transform(face_resized).unsqueeze(0).to(device)
            with torch.no_grad():
                output = model(face_tensor)
                pred = torch.argmax(output, 1).item()
                label = class_labels[pred]
            last_labels.append((x, y, w, h, label))
        last_prediction_time = current_time
    # Draw rectangles and labels from last_labels
    for (x, y, w, h, label) in last_labels:
        color = (0, 255, 0)
        cv2.rectangle(frame, (x, y), (x+w, y+h), color, 2)
        cv2.putText(frame, label, (x, y-10), cv2.FONT_HERSHEY_SIMPLEX, 0.9, color, 2)
    cv2.imshow('Webcam Emotion Recognition', frame)
    if cv2.waitKey(1) & 0xFF == ord('q'):
        break
cap.release()
cv2.destroyAllWindows()

Press 'q' to quit.


In [None]:
# Finalized MobileNet with better accuracy

import torch
import torch.nn as nn
import torchvision.transforms as transforms
import cv2
import numpy as np
import torchvision.models as models
from PIL import Image

# Device
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Class labels (update to match your dataset folder order)
class_labels = ['Angry', 'Boring', 'Disgust', 'Fear', 'Happy', 'Neural', 'Sad', 'Stress', 'Suprise']

# Initialize the model with the same architecture as when trained
model = models.mobilenet_v3_large(pretrained=False)  # Don't load pretrained weights
num_classes = 9  # Update to match your class count
model.classifier[3] = nn.Linear(model.classifier[3].in_features, num_classes)  # Modify last layer

# Now load the state_dict
model.load_state_dict(torch.load("Models/mobilenet_v3_large_finetuned_1000.pth", map_location=device))
model = model.to(device)
model.eval()

# Preprocessing for webcam frames
transform = transforms.Compose([
    transforms.Resize((224, 224)),      
    transforms.ToTensor(),
    transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])
])

# Load Haar Cascade for face detection
face_cascade = cv2.CascadeClassifier(cv2.data.haarcascades + 'haarcascade_frontalface_default.xml')

cap = cv2.VideoCapture(0)
print("Press 'q' to quit.")
import time

last_prediction_time = 0
prediction_interval = 2  # seconds
last_labels = []

while True:
    ret, frame = cap.read()
    if not ret:
        break
    gray = cv2.cvtColor(frame, cv2.COLOR_BGR2GRAY)
    faces = face_cascade.detectMultiScale(gray, scaleFactor=1.1, minNeighbors=5)
    current_time = time.time()
    if current_time - last_prediction_time >= prediction_interval:
        last_labels = []
        for (x, y, w, h) in faces:
            face = gray[y:y+h, x:x+w]
            face_resized = cv2.resize(face, (64, 64))
            face_pil = Image.fromarray(cv2.cvtColor(face_resized, cv2.COLOR_GRAY2RGB))  # Convert to RGB PIL
            face_tensor = transform(face_pil).unsqueeze(0).to(device)
            with torch.no_grad():
                output = model(face_tensor)
                pred = torch.argmax(output, 1).item()
                label = class_labels[pred]
            last_labels.append((x, y, w, h, label))
        last_prediction_time = current_time
    # Draw rectangles and labels from last_labels
    for (x, y, w, h, label) in last_labels:
        color = (0, 255, 0)
        cv2.rectangle(frame, (x, y), (x+w, y+h), color, 2)
        cv2.putText(frame, label, (x, y-10), cv2.FONT_HERSHEY_SIMPLEX, 0.9, color, 2)
    cv2.imshow('Webcam Emotion Recognition from MobileNet', frame)
    if cv2.waitKey(1) & 0xFF == ord('q'):
        break
cap.release()
cv2.destroyAllWindows()



Press 'q' to quit.


In [None]:
# Real Time Prediction Code with YOLO classification without face detection

from ultralytics import YOLO
import cv2

# Load your custom model
model = YOLO("Models/best_new.pt")

# Open the webcam
cap = cv2.VideoCapture(0)  # 0 is usually the default webcam

while True:
    # Read a frame from the webcam
    ret, frame = cap.read()
    if not ret:
        break

    # Run inference on the frame
    results = model(frame)

    # Visualize the results on the frame
    annotated_frame = results[0].plot()

    # Display the annotated frame
    cv2.imshow("YOLOv11 Inference", annotated_frame)

    # Break the loop if 'q' is pressed
    if cv2.waitKey(1) & 0xFF == ord('q'):
        break

# Release the webcam and close windows
cap.release()
cv2.destroyAllWindows()


0: 448x448 boring 0.94, disgust 0.02, stress 0.01, sad 0.01, neutral 0.01, 13.2ms
Speed: 13.2ms preprocess, 13.2ms inference, 0.1ms postprocess per image at shape (1, 3, 448, 448)

0: 448x448 boring 0.89, disgust 0.05, stress 0.02, sad 0.02, neutral 0.01, 11.4ms
Speed: 12.4ms preprocess, 11.4ms inference, 0.1ms postprocess per image at shape (1, 3, 448, 448)

0: 448x448 boring 0.79, disgust 0.09, sad 0.04, neutral 0.03, stress 0.03, 11.4ms
Speed: 14.0ms preprocess, 11.4ms inference, 0.1ms postprocess per image at shape (1, 3, 448, 448)

0: 448x448 boring 0.91, disgust 0.03, sad 0.02, neutral 0.02, stress 0.01, 10.8ms
Speed: 11.4ms preprocess, 10.8ms inference, 0.1ms postprocess per image at shape (1, 3, 448, 448)

0: 448x448 boring 0.92, disgust 0.02, neutral 0.02, sad 0.02, happy 0.01, 14.0ms
Speed: 11.7ms preprocess, 14.0ms inference, 0.1ms postprocess per image at shape (1, 3, 448, 448)

0: 448x448 boring 0.94, disgust 0.02, neutral 0.01, sad 0.01, happy 0.01, 11.8ms
Speed: 10.6ms 

In [1]:
# YOLO with harcascase face detection

import cv2
from ultralytics import YOLO

# Load Haar Cascade face detector
face_cascade = cv2.CascadeClassifier(cv2.data.haarcascades + "haarcascade_frontalface_default.xml")

# Load your YOLO classification model
model = YOLO("Models/best_new.pt")


# Open the webcam
cap = cv2.VideoCapture(0)

while True:
    ret, frame = cap.read()
    if not ret:
        break

    # Convert frame to grayscale for face detection
    gray = cv2.cvtColor(frame, cv2.COLOR_BGR2GRAY)

    # Detect faces
    faces = face_cascade.detectMultiScale(gray, 1.1, 4)  # (x, y, w, h) for each face

    for (x, y, w, h) in faces:
        # Crop the face from the frame
        face_img = frame[y:y+h, x:x+w]

        # (Optional) Resize face_img to the input size expected by your YOLO model
        # face_img = cv2.resize(face_img, (224, 224))  # Adjust size as needed

        # Run YOLO classification on the cropped face
        results = model(face_img)

        # Visualize results (optional: you can draw on the original frame)
        annotated_face = results[0].plot()
        frame[y:y+h, x:x+w] = cv2.resize(annotated_face, (w, h))

        # Draw bounding box around the face
        cv2.rectangle(frame, (x, y), (x+w, y+h), (0, 255, 0), 2)

    # Show the frame with annotations
    cv2.imshow("Face Detection + YOLO Classification", frame)

    if cv2.waitKey(1) & 0xFF == ord('q'):
        break

cap.release()
cv2.destroyAllWindows()



0: 448x448 sad 0.25, stress 0.23, angry 0.11, fear 0.11, neutral 0.10, 8.5ms
Speed: 10.2ms preprocess, 8.5ms inference, 0.1ms postprocess per image at shape (1, 3, 448, 448)

0: 448x448 sad 0.28, neutral 0.21, angry 0.10, fear 0.10, stress 0.09, 8.7ms
Speed: 7.0ms preprocess, 8.7ms inference, 0.1ms postprocess per image at shape (1, 3, 448, 448)

0: 448x448 neutral 0.48, sad 0.32, angry 0.11, disgust 0.03, fear 0.02, 8.5ms
Speed: 7.3ms preprocess, 8.5ms inference, 0.1ms postprocess per image at shape (1, 3, 448, 448)

0: 448x448 neutral 0.47, sad 0.36, angry 0.11, disgust 0.02, happy 0.01, 8.0ms
Speed: 8.7ms preprocess, 8.0ms inference, 0.1ms postprocess per image at shape (1, 3, 448, 448)

0: 448x448 stress 0.71, sad 0.10, neutral 0.08, happy 0.04, fear 0.03, 9.5ms
Speed: 6.9ms preprocess, 9.5ms inference, 0.1ms postprocess per image at shape (1, 3, 448, 448)

0: 448x448 neutral 0.48, sad 0.33, angry 0.07, disgust 0.05, suprised 0.02, 9.7ms
Speed: 9.0ms preprocess, 9.7ms inference, 0

KeyboardInterrupt: 

In [1]:
# Finalized Yolo from my previous work at apps for another dataset

from ultralytics import YOLO
import cv2

# Load your custom model
model = YOLO(r"D:\RuhunaNew\Academic\Research\Facial_Recog_Repo\Group_50_Repo\Models\best_new.pt")

# Open the webcam
cap = cv2.VideoCapture(0)  # 0 is usually the default webcam

import time

last_prediction_time = 0
prediction_interval = 4  # seconds
last_labels = []
last_confidences = []

while True:
    # Read a frame from the webcam
    ret, frame = cap.read()
    if not ret:
        break

    current_time = time.time()
    if current_time - last_prediction_time >= prediction_interval:
        last_labels = []
        last_confidences = []
        # Perform inference
        # Run inference on the frame
        results = model(frame)

        # Visualize the results on the frame
        annotated_frame = results[0].plot()

        # Display the annotated frame
        cv2.imshow("YOLOv11 Inference", annotated_frame)

        # Break the loop if 'q' is pressed
        if cv2.waitKey(1) & 0xFF == ord('q'):
            break

# Release the webcam and close windows
cap.release()
cv2.destroyAllWindows()


0: 448x448 boring 0.91, stress 0.04, happy 0.02, neutral 0.01, disgust 0.01, 8.4ms
Speed: 14.3ms preprocess, 8.4ms inference, 0.1ms postprocess per image at shape (1, 3, 448, 448)

0: 448x448 boring 0.97, stress 0.01, happy 0.01, disgust 0.00, neutral 0.00, 9.8ms
Speed: 11.5ms preprocess, 9.8ms inference, 0.1ms postprocess per image at shape (1, 3, 448, 448)

0: 448x448 boring 0.97, happy 0.01, stress 0.01, disgust 0.00, neutral 0.00, 10.1ms
Speed: 10.4ms preprocess, 10.1ms inference, 0.1ms postprocess per image at shape (1, 3, 448, 448)

0: 448x448 boring 0.96, disgust 0.02, happy 0.01, stress 0.00, neutral 0.00, 9.1ms
Speed: 11.6ms preprocess, 9.1ms inference, 0.1ms postprocess per image at shape (1, 3, 448, 448)

0: 448x448 boring 0.98, disgust 0.01, happy 0.00, neutral 0.00, stress 0.00, 11.5ms
Speed: 10.2ms preprocess, 11.5ms inference, 0.1ms postprocess per image at shape (1, 3, 448, 448)

0: 448x448 boring 0.95, disgust 0.03, neutral 0.00, happy 0.00, sad 0.00, 10.9ms
Speed: 11

KeyboardInterrupt: 

In [None]:
# Finalized yolo for our dataset



In [None]:

from ultralytics import YOLO
import cv2

# Load a model
model = YOLO("yolo11n.pt")  # load an official model
# Open the webcam
cap = cv2.VideoCapture(0)  # 0 is usually the default webcam

import time

last_prediction_time = 0
prediction_interval = 8  # seconds
last_labels = []
last_confidences = []

while True:
    ret, frame = cap.read()
    if not ret:
        break

    results = model(frame)
    for result in results:
        for box in result.boxes:
            class_id = int(box.cls)
            class_name = model.names[class_id]
            print(f"Detected class: {class_name}")

            # Optionally, get bounding box coordinates
            x1, y1, x2, y2 = box.xyxy[0].tolist()
            # Draw bounding box and label
            cv2.rectangle(frame, (int(x1), int(y1)), (int(x2), int(y2)), (255, 0, 255), 2)
            cv2.putText(frame, class_name, (int(x1), int(y1)-10), cv2.FONT_HERSHEY_SIMPLEX, 0.9, (255,0,0), 2)

    cv2.imshow("YOLO Detection", frame)
    if cv2.waitKey(1) & 0xFF == ord('q'):
        break


# Release the webcam and close windows
cap.release()
cv2.destroyAllWindows()


0: 480x640 1 person, 22.5ms
Speed: 2.1ms preprocess, 22.5ms inference, 3.5ms postprocess per image at shape (1, 3, 480, 640)
Detected class: person

0: 480x640 2 persons, 22.0ms
Speed: 2.4ms preprocess, 22.0ms inference, 3.3ms postprocess per image at shape (1, 3, 480, 640)
Detected class: person
Detected class: person

0: 480x640 2 persons, 1 chair, 25.6ms
Speed: 1.9ms preprocess, 25.6ms inference, 4.5ms postprocess per image at shape (1, 3, 480, 640)
Detected class: person
Detected class: person
Detected class: chair

0: 480x640 2 persons, 1 chair, 21.4ms
Speed: 2.4ms preprocess, 21.4ms inference, 3.1ms postprocess per image at shape (1, 3, 480, 640)
Detected class: person
Detected class: chair
Detected class: person

0: 480x640 1 person, 2 chairs, 21.5ms
Speed: 1.4ms preprocess, 21.5ms inference, 3.2ms postprocess per image at shape (1, 3, 480, 640)
Detected class: person
Detected class: chair
Detected class: chair

0: 480x640 1 person, 1 chair, 22.3ms
Speed: 1.5ms preprocess, 22.3

KeyboardInterrupt: 

### CNN + harcascade for face detection

In [None]:
from ultralytics import YOLO
import torch
import torch.nn as nn
import torchvision.transforms as transforms
import cv2
import numpy as np
import time
from ultralytics import YOLO
import cv2
import time

# Device
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Class labels (update to match your dataset folder order)
class_labels = ['Angry', 'Boring', 'Disgust', 'Fear', 'Happy', 'Neural', 'Sad', 'Stress', 'Suprise']

def get_person_detected_frame():
    # Load a model
    model = YOLO("yolo11n.pt")
    while True:
        ret, frame = cap.read()
        if not ret:
            break
    results = model(frame)
    for result in results:
        for box in result.boxes:
            class_id = int(box.cls)
            class_name = model.names[class_id]
            # Optionally, get bounding box coordinates
            x1, y1, x2, y2 = box.xyxy[0].tolist()
            print(f"Detected class: {class_name}")
            if class_name == 'person':
                return frame


# Model definition (must match training)
class EmotionCNN(nn.Module):
    def __init__(self, num_classes):
        super(EmotionCNN, self).__init__()
        self.features = nn.Sequential(
            nn.Conv2d(1, 32, 3, padding=1), nn.ReLU(),
            nn.BatchNorm2d(32),
            nn.Conv2d(32, 64, 3, padding=1), nn.ReLU(),
            nn.BatchNorm2d(64),
            nn.MaxPool2d(2), nn.Dropout(0.25),

            nn.Conv2d(64, 128, 3, padding=1), nn.ReLU(),
            nn.BatchNorm2d(128),
            nn.MaxPool2d(2), nn.Dropout(0.25)
        )
        self.classifier = nn.Sequential(
            nn.Flatten(),
            nn.Linear(128 * 16 * 16, 256), nn.ReLU(),
            nn.Dropout(0.5),
            nn.Linear(256, num_classes)
        )

    def forward(self, x):
        x = self.features(x)
        x = self.classifier(x)
        return x

# Load model
model = EmotionCNN(len(class_labels)).to(device)
model.load_state_dict(torch.load("emotion_model.pth", map_location=device))
model.eval()

# Load Haar Cascade for face detection
face_cascade = cv2.CascadeClassifier(cv2.data.haarcascades + 'haarcascade_frontalface_default.xml')

transform = transforms.Compose([
    transforms.ToTensor(),
    transforms.Normalize([0.5], [0.5])
])

# Open the webcam
cap = cv2.VideoCapture(0)  # 0 is usually the default webcam

last_prediction_time = 0
prediction_interval = 1  # seconds
last_labels = []

while True:
    ret, frame = cap.read()
    if not ret:
        break
    gray = cv2.cvtColor(frame, cv2.COLOR_BGR2GRAY)
    # faces = face_cascade.detectMultiScale(gray, scaleFactor=1.1, minNeighbors=5)
    current_time = time.time()
    if current_time - last_prediction_time >= prediction_interval:
        last_labels = []
        face = get_person_detected_frame()
        face_resized = cv2.resize(face, (64, 64))
        face_tensor = transform(face_resized).unsqueeze(0).to(device)
        with torch.no_grad():
            output = model(face_tensor)
            pred = torch.argmax(output, 1).item()
            label = class_labels[pred]
        last_labels.append((x, y, w, h, label))
        last_prediction_time = current_time
        # Draw rectangles and labels from last_labels
    for (x, y, w, h, label) in last_labels:
        color = (0, 255, 0)
        cv2.rectangle(frame, (x, y), (x+w, y+h), color, 2)
        cv2.putText(frame, label, (x, y-10), cv2.FONT_HERSHEY_SIMPLEX, 0.9, color, 2)
    cv2.imshow('Webcam Emotion Recognition', frame)
    if cv2.waitKey(1) & 0xFF == ord('q'):
        break     
cap.release()
cv2.destroyAllWindows()

KeyboardInterrupt: 

In [None]:
from ultralytics import YOLO
import cv2
import torch
import torchvision.transforms as transforms
import torch.nn as nn
import time

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Load face detection YOLO model (replace with your face detection model path)
face_detector = YOLO("yolo11n.pt")  # pretrained face detection model

# Emotion classes
class_labels = ['Angry', 'Boring', 'Disgust', 'Fear', 'Happy', 'Neural', 'Sad', 'Stress', 'Suprise']

class EmotionCNN(nn.Module):
    def __init__(self, num_classes):
        super(EmotionCNN, self).__init__()
        self.features = nn.Sequential(
            nn.Conv2d(1, 32, 3, padding=1), nn.ReLU(),
            nn.BatchNorm2d(32),
            nn.Conv2d(32, 64, 3, padding=1), nn.ReLU(),
            nn.BatchNorm2d(64),
            nn.MaxPool2d(2), nn.Dropout(0.25),

            nn.Conv2d(64, 128, 3, padding=1), nn.ReLU(),
            nn.BatchNorm2d(128),
            nn.MaxPool2d(2), nn.Dropout(0.25)
        )
        self.classifier = nn.Sequential(
            nn.Flatten(),
            nn.Linear(128 * 16 * 16, 256), nn.ReLU(),
            nn.Dropout(0.5),
            nn.Linear(256, num_classes)
        )

    def forward(self, x):
        x = self.features(x)
        x = self.classifier(x)
        return x


emotion_model = EmotionCNN(len(class_labels)).to(device)
emotion_model.load_state_dict(torch.load("emotion_model.pth", map_location=device))
emotion_model.eval()

transform = transforms.Compose([
    transforms.ToTensor(),
    transforms.Normalize([0.5], [0.5])
])

cap = cv2.VideoCapture(0)

last_prediction_time = 0
prediction_interval = 4
predictions = [] 

while True:
    ret, frame = cap.read()
    if not ret:
        break

    current_time = time.time()
    if current_time - last_prediction_time >= prediction_interval:
        predictions = []  # Clear old predictions

        # Detect faces using YOLO face detector
        results = face_detector(frame)

        for result in results:
            for box in result.boxes:
                x1, y1, x2, y2 = map(int, box.xyxy[0].tolist())

                # Crop face region
                face_img = frame[y1:y2, x1:x2]

                # Preprocess face for emotion recognition
                face_gray = cv2.cvtColor(face_img, cv2.COLOR_BGR2GRAY)
                face_resized = cv2.resize(face_gray, (64, 64))
                face_tensor = transform(face_resized).unsqueeze(0).to(device)

                with torch.no_grad():
                    output = emotion_model(face_tensor)
                    pred = torch.argmax(output, 1).item()
                    emotion_label = class_labels[pred]

                # Store prediction for drawing later
                predictions.append((x1, y1, x2, y2, emotion_label))

        last_prediction_time = current_time

    # Draw bounding boxes and labels from last predictions
    for (x1, y1, x2, y2, label) in predictions:
        cv2.rectangle(frame, (x1, y1), (x2, y2), (0, 255, 0), 2)
        cv2.putText(frame, label, (x1, y1 - 10),
                    cv2.FONT_HERSHEY_SIMPLEX, 0.9, (0, 255, 0), 2)

    cv2.imshow("Face Emotion Recognition", frame)

    if cv2.waitKey(1) & 0xFF == ord('q'):
        break

cap.release()
cv2.destroyAllWindows()



0: 480x640 3 persons, 90.3ms
Speed: 1.7ms preprocess, 90.3ms inference, 115.0ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 2 persons, 21.0ms
Speed: 2.2ms preprocess, 21.0ms inference, 4.0ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 1 person, 15.7ms
Speed: 1.6ms preprocess, 15.7ms inference, 3.8ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 1 person, 20.2ms
Speed: 2.0ms preprocess, 20.2ms inference, 3.7ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 1 person, 26.7ms
Speed: 2.5ms preprocess, 26.7ms inference, 11.1ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 1 person, 1 cup, 15.6ms
Speed: 1.5ms preprocess, 15.6ms inference, 2.9ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 1 person, 1 cup, 23.3ms
Speed: 2.8ms preprocess, 23.3ms inference, 5.9ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 1 person, 16.9ms
Speed: 2.3ms preprocess, 16.9ms inference, 2.8ms postprocess per image

KeyboardInterrupt: 