In [4]:
#!pip install ultralytics


In [None]:
import cv2
import torch
import numpy as np
from PIL import Image
from torchvision import transforms
from torchvision.models import mobilenet_v2
from ultralytics import YOLO

In [None]:
# Gerät für Inferenz
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Klassifikationsmodell laden
model = mobilenet_v2(pretrained=False)
model.classifier[1] = torch.nn.Linear(model.last_channel, 2)
model.load_state_dict(torch.load('catid_mobilenetv2.pt', map_location=device))
model.eval()
model.to(device)

# Preprocessing für Klassifikation
preprocess = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406],
                         std=[0.229, 0.224, 0.225]),
])

# YOLOv8-Detect für Katzen-Detektion
yolo = YOLO('yolov8n.pt')  # COCO-Modell enthält Klasse 'cat'

# Mapping der Klassenausgabe
cmp = {0: 'Elsa', 1: 'Fabius'}

# Video-Stream von Webcam öffnen
cap = cv2.VideoCapture(0)
if not cap.isOpened():
    raise RuntimeError("Webcam kann nicht geöffnet werden")

In [None]:
print("Starte Live-Stream. Drücke 'q' zum Beenden.")
while True:
    ret, frame = cap.read()
    if not ret:
        break

    # Frame für YOLO (RGB!)
    img = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
    results = yolo(img)[0]

    # Durch alle Bounding Boxes iterieren
    for box in results.boxes:
        cls_id = int(box.cls.cpu().numpy()[0])
        # COCO Klasse 15 = 'cat'
        if cls_id == 15:
            x1, y1, x2, y2 = map(int, box.xyxy.cpu().numpy()[0])
            crop = frame[y1:y2, x1:x2]

            # Klassifikation
            pil_img = Image.fromarray(cv2.cvtColor(crop, cv2.COLOR_BGR2RGB))
            input_t = preprocess(pil_img).unsqueeze(0).to(device)
            with torch.no_grad():
                out = model(input_t)
                pred = out.argmax(dim=1).item()
            label = cmp[pred]

            # Bounding Box & Label einzeichnen
            cv2.rectangle(frame, (x1, y1), (x2, y2), (0, 255, 0), 2)
            cv2.putText(frame, label, (x1, y1 - 10),
                        cv2.FONT_HERSHEY_SIMPLEX, 0.9, (0, 255, 0), 2)

    # Frame anzeigen
    cv2.imshow('CatFeederAI', frame)
    if cv2.waitKey(1) & 0xFF == ord('q'):
        break

# Aufräumen
cap.release()
cv2.destroyAllWindows()




Starte Live-Stream. Drücke 'q' zum Beenden.

0: 480x640 1 person, 64.7ms
Speed: 2.3ms preprocess, 64.7ms inference, 1.4ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 1 person, 78.4ms
Speed: 1.6ms preprocess, 78.4ms inference, 1.1ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 1 person, 67.1ms
Speed: 1.5ms preprocess, 67.1ms inference, 1.0ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 1 person, 74.1ms
Speed: 1.5ms preprocess, 74.1ms inference, 1.2ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 1 person, 1 snowboard, 63.1ms
Speed: 1.3ms preprocess, 63.1ms inference, 1.2ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 1 person, 67.2ms
Speed: 1.6ms preprocess, 67.2ms inference, 1.0ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 1 person, 64.5ms
Speed: 1.3ms preprocess, 64.5ms inference, 1.0ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 1 person, 59.3ms
Speed: 1.2ms preprocess, 59.3ms 