In [2]:
import os
import cv2
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from PIL import Image
from torchvision import models, transforms
import torch
from ultralytics import YOLO
from sklearn.metrics import confusion_matrix, classification_report

In [3]:
# Path to the YOLO dataset description file (data.yaml).
# Override per machine by setting the DOG_EMOTION_DATA_YAML environment variable
# before starting the kernel; the original author's local path stays the default
# so existing runs behave exactly as before.
data_yaml = os.environ.get(
    "DOG_EMOTION_DATA_YAML",
    r"C:\Users\abdal\Downloads\Compressed\Dog Detection Dataset.v4-dog-emotion-dataset-with-happy.yolov8\data.yaml",
)


In [4]:
# Start from the pretrained YOLOv8-nano checkpoint and fine-tune it on the
# dataset described by `data_yaml`.
model = YOLO("yolov8n.pt")

results = model.train(
    data=data_yaml,
    epochs=50,
    imgsz=640,
    batch=16,
    # Use the first CUDA GPU when one is present; otherwise fall back to the CPU
    # instead of failing on machines without CUDA.
    device=0 if torch.cuda.is_available() else "cpu",
    project="runs/dog_emotions",
    name="yolov8_exp",
    exist_ok=True  # reuse the run directory instead of creating yolov8_exp2, ...
)

Ultralytics 8.3.192  Python-3.10.0 torch-2.5.1+cu121 CUDA:0 (NVIDIA GeForce RTX 3050 Laptop GPU, 4096MiB)
[34m[1mengine\trainer: [0magnostic_nms=False, amp=True, augment=False, auto_augment=randaugment, batch=16, bgr=0.0, box=7.5, cache=False, cfg=None, classes=None, close_mosaic=10, cls=0.5, conf=None, copy_paste=0.0, copy_paste_mode=flip, cos_lr=False, cutmix=0.0, data=C:\Users\abdal\Downloads\Compressed\Dog Detection Dataset.v4-dog-emotion-dataset-with-happy.yolov8\data.yaml, degrees=0.0, deterministic=True, device=0, dfl=1.5, dnn=False, dropout=0.0, dynamic=False, embed=None, epochs=50, erasing=0.4, exist_ok=True, fliplr=0.5, flipud=0.0, format=torchscript, fraction=1.0, freeze=None, half=False, hsv_h=0.015, hsv_s=0.7, hsv_v=0.4, imgsz=640, int8=False, iou=0.7, keras=False, kobj=1.0, line_width=None, lr0=0.01, lrf=0.01, mask_ratio=4, max_det=300, mixup=0.0, mode=train, model=yolov8n.pt, momentum=0.937, mosaic=1.0, multi_scale=False, name=yolov8_exp, nbs=64, nms=False, opset=None

In [5]:
# Evaluate the model on the validation split.
metrics = model.val()

# Print only the headline detection metrics; printing the whole metrics object
# floods the cell output with every internal attribute.
print(f"mAP50-95: {metrics.box.map:.3f}")
print(f"mAP50:    {metrics.box.map50:.3f}")
print(f"mAP75:    {metrics.box.map75:.3f}")

Ultralytics 8.3.192  Python-3.10.0 torch-2.5.1+cu121 CUDA:0 (NVIDIA GeForce RTX 3050 Laptop GPU, 4096MiB)
Model summary (fused): 72 layers, 3,006,428 parameters, 0 gradients, 8.1 GFLOPs
[34m[1mval: [0mFast image access  (ping: 0.10.0 ms, read: 432.8262.0 MB/s, size: 75.7 KB)
[K[34m[1mval: [0mScanning C:\Users\abdal\Downloads\Compressed\Dog Detection Dataset.v4-dog-emotion-dataset-with-happy.yolov8\valid\labels.cache... 317 images, 0 backgrounds, 0 corrupt: 100% ━━━━━━━━━━━━ 317/317 316268.9it/s 0.0s
[K                 Class     Images  Instances      Box(P          R      mAP50  mAP50-95): 100% ━━━━━━━━━━━━ 20/20 5.1it/s 3.9s0.2s
                   all        317        326      0.942      0.885      0.941      0.885
            Aggressive         23         23      0.973      0.913      0.952      0.907
               Anxiety        137        138      0.977      0.944      0.987      0.947
                  Fear         76         77      0.932      0.895       0.94      0.87

In [6]:
# Single-image sanity check: run the detector on one picture and list every
# detection whose confidence is at least 0.5.
test_img = r"C:\Users\abdal\Desktop\angry-dog-blog-1080x675-1.webp"

results = model(test_img, show=True, conf=0.5)
for r in results:
    detections = r.boxes
    # Move the box coordinates, class ids and confidences to host memory as NumPy arrays.
    boxes, labels, confs = (
        tensor.cpu().numpy()
        for tensor in (detections.xyxy, detections.cls, detections.conf)
    )

    for box, label, conf in zip(boxes, labels, confs):
        print(f"Class: {model.names[int(label)]}, Confidence: {conf:.2f}")



image 1/1 C:\Users\abdal\Desktop\angry-dog-blog-1080x675-1.webp: 416x640 1 Anxiety, 60.3ms
Speed: 2.9ms preprocess, 60.3ms inference, 8.9ms postprocess per image at shape (1, 3, 416, 640)
Class: Anxiety, Confidence: 0.86


In [7]:
# Re-run detection on the same test image (defined in the previous cell),
# save the annotated picture and show it inline.
results = model(test_img, conf=0.5)

for r in results:
    annotated = r.plot()  # annotated frame as a BGR NumPy array

    # cv2.imwrite reports failure through its return value instead of raising.
    if not cv2.imwrite("detected_dog.jpg", annotated):
        raise IOError("Could not write detected_dog.jpg")

    # Render inline rather than via cv2.imshow + waitKey(0), which blocks the
    # kernel until a key is pressed in a separate window and stalls "Run All".
    fig, ax = plt.subplots()
    ax.imshow(cv2.cvtColor(annotated, cv2.COLOR_BGR2RGB))
    ax.set_title("Dog Detection")
    ax.axis("off")
    plt.show()



image 1/1 C:\Users\abdal\Desktop\angry-dog-blog-1080x675-1.webp: 416x640 1 Anxiety, 25.2ms
Speed: 2.4ms preprocess, 25.2ms inference, 1.8ms postprocess per image at shape (1, 3, 416, 640)


In [8]:
def run_video(source=0):
    """Run the detector on a video stream and show annotated frames live.

    Args:
        source: Anything accepted by ``cv2.VideoCapture`` - a camera index
            (default ``0``) or a path to a video file.

    The loop ends when the stream runs out of frames or when 'q' is pressed in
    the display window. The capture device and the window are always released,
    even if inference raises part-way through.
    """
    cap = cv2.VideoCapture(source)
    if not cap.isOpened():
        print(f"Error: Could not open video source: {source}")
        cap.release()
        return

    try:
        while cap.isOpened():
            ret, frame = cap.read()
            if not ret:
                break
            results = model(frame)
            annotated = results[0].plot()
            cv2.imshow("Dog Emotion Detection", annotated)
            # waitKey(1) also pumps the GUI event loop so the window refreshes.
            if cv2.waitKey(1) & 0xFF == ord("q"):
                break
    finally:
        # Without this, an exception would leave the camera locked and the
        # window open until the kernel is restarted.
        cap.release()
        cv2.destroyAllWindows()


In [9]:
# Persist the model weights so later cells can reload them from disk.
saved_model_path = r"C:\Users\abdal\Desktop\model.pt"
model.save(saved_model_path)

In [10]:
# --- Configuration ---
# IMPORTANT: Replace with the actual path to your fine-tuned model.pt file.
# This is the same location the earlier `model.save(...)` cell writes to, and it
# is the file the model-initialisation cell loads into `yolo_model`.
YOLO_MODEL_PATH = r"C:\Users\abdal\Desktop\model.pt"

# Pooling strategy passed to `temporal_pooling` by `process_video_source`.
# Supported values are 'average' and 'max'; any other value is rejected with a
# ValueError once there are features to pool.
TEMP_POOLING_METHOD = 'average'

In [11]:
# --- CELL 2: Initialize Models and Device ---
# Defines the notebook-level names used by later cells:
#   device        - torch device used for feature extraction
#   yolo_model    - fine-tuned YOLOv8 detector loaded from YOLO_MODEL_PATH
#   feature_model - ResNet50 with its last child module removed
#   preprocess    - torchvision transform applied to each cropped patch

# 1. Device Configuration (Use GPU if available, otherwise CPU)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

# 2. Load the fine-tuned YOLOv8 detection model
# Failures are re-raised: printing and continuing would leave `yolo_model`
# undefined and surface later as an unrelated NameError.
try:
    yolo_model = YOLO(YOLO_MODEL_PATH)
except Exception as e:
    raise RuntimeError(f"Error loading YOLO model: {e}") from e
print("YOLOv8 model loaded successfully.")

# 3. Load the pre-trained ResNet50 model for feature extraction
try:
    # Load the pre-trained model with default weights
    resnet50_full = models.resnet50(weights=models.ResNet50_Weights.DEFAULT)

    # Remove the final classification layer to get the feature vector
    feature_model = torch.nn.Sequential(*(list(resnet50_full.children())[:-1]))

    # Set the model to evaluation mode and move it to the configured device
    feature_model.eval()
    feature_model.to(device)
except Exception as e:
    raise RuntimeError(f"Error loading feature extraction model: {e}") from e
print("Feature extraction model (ResNet50) loaded successfully.")

# 4. Define the image transformation pipeline for the feature extractor
preprocess = transforms.Compose([
    transforms.Resize(256),
    transforms.CenterCrop(224),
    transforms.ToTensor(),
    # Per-channel mean/std normalisation applied after conversion to a tensor.
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
])

Using device: cuda
YOLOv8 model loaded successfully.
Feature extraction model (ResNet50) loaded successfully.


In [12]:
# --- CELL 3: Define Helper Functions ---

def extract_cnn_features(patch_img, model):
    """Compute a CNN feature array for one image patch.

    Args:
        patch_img: Image patch as a NumPy array in OpenCV's BGR channel order
            (it is converted to RGB before being handed to PIL).
        model: Callable network applied to the preprocessed, batched patch.

    Returns:
        The model output with all singleton dimensions squeezed out, moved to
        the CPU and converted to a NumPy array.

    Relies on the notebook-level ``preprocess`` transform and ``device``.
    """
    pil_patch = Image.fromarray(cv2.cvtColor(patch_img, cv2.COLOR_BGR2RGB))

    # Add a leading batch dimension and place the tensor on the configured device.
    batch = preprocess(pil_patch).unsqueeze(0).to(device)

    # Inference only: no gradients are needed.
    with torch.no_grad():
        features = model(batch)

    return features.squeeze().cpu().numpy()

def temporal_pooling(frame_features, method='average'):
    """Aggregates frame-level features into a single video-level feature vector.

    Args:
        frame_features: Sequence of equally shaped feature vectors - a list of
            arrays or a stacked ``ndarray`` whose first axis indexes the
            vectors. ``None`` is treated like an empty sequence.
        method: ``'average'`` for the element-wise mean or ``'max'`` for the
            element-wise maximum along the first axis.

    Returns:
        The pooled vector, or an empty array when there is nothing to pool.

    Raises:
        ValueError: If ``method`` is not supported and the input is non-empty.
    """
    # Use an explicit length check: `not frame_features` raises
    # "truth value of an array ... is ambiguous" for an ndarray input.
    if frame_features is None or len(frame_features) == 0:
        return np.array([])

    if method == 'average':
        return np.mean(frame_features, axis=0)
    if method == 'max':
        return np.max(frame_features, axis=0)
    raise ValueError(f"Unsupported temporal pooling method: {method}")

In [13]:
# --- CELL 4: Define the Main Video Processing Logic ---

def process_video_source(source):
    """Processes a video source (file path or camera index) to extract video features.

    Every frame is passed through ``yolo_model``; each detected box is cropped
    from the frame, run through ``extract_cnn_features`` with ``feature_model``,
    and all resulting vectors are pooled with ``temporal_pooling`` using
    ``TEMP_POOLING_METHOD``.

    Args:
        source: Anything accepted by ``cv2.VideoCapture`` (file path or camera index).

    Returns:
        ``None`` if the source cannot be opened; otherwise the pooled feature
        array, which is empty when no detection produced a usable patch.
    """
    cap = cv2.VideoCapture(source)
    if not cap.isOpened():
        print(f"Error: Could not open video source: {source}")
        cap.release()
        return None

    all_dog_features = []
    print("Processing video... Press 'q' in the display window to stop.")

    try:
        while cap.isOpened():
            ret, frame = cap.read()
            if not ret:
                break

            # Run YOLOv8 detection
            results = yolo_model(frame, verbose=False)
            result = results[0]
            annotated_frame = result.plot()  # Draws bounding boxes on the frame

            frame_h, frame_w = frame.shape[:2]

            # Process each detected dog
            for box in result.boxes:
                x1, y1, x2, y2 = map(int, box.xyxy[0].tolist())
                # Clamp to the frame: a negative index would wrap around when
                # slicing and silently crop the wrong region.
                x1, y1 = max(x1, 0), max(y1, 0)
                x2, y2 = min(x2, frame_w), min(y2, frame_h)
                dog_patch = frame[y1:y2, x1:x2]

                if dog_patch.shape[0] > 0 and dog_patch.shape[1] > 0:
                    cnn_features = extract_cnn_features(dog_patch, feature_model)
                    all_dog_features.append(cnn_features)

            # Display the output
            cv2.imshow("Dog Detection and Feature Extraction", annotated_frame)
            if cv2.waitKey(1) & 0xFF == ord('q'):
                break
    finally:
        # Always free the camera/file handle and close the window, even when
        # detection or feature extraction raises mid-stream.
        cap.release()
        cv2.destroyAllWindows()

    # After the loop, apply temporal pooling
    video_features = temporal_pooling(all_dog_features, method=TEMP_POOLING_METHOD)

    print("\n--- Processing Complete ---")
    # Test for emptiness with .size: .any() is False for a valid all-zero vector.
    if video_features.size > 0:
        print("Final Video-Features vector created successfully.")
        print(f"Shape: {video_features.shape}")
        print("Features (first 10 values):", video_features[:10])
    else:
        print("No dogs were detected to extract features from.")

    return video_features

In [14]:
# --- CELL 5: Execute the Process ---

if __name__ == "__main__":
    # Bind the result up front so it exists (as None) after an invalid choice too.
    final_features = None

    choice = input("Choose usage:\n1: Input recorded video\n2: Open live camera\nEnter choice (1 or 2): ").strip()

    if choice == '1':
        # Drop surrounding whitespace and the quotes that Windows "Copy as path"
        # adds; either would make the path impossible to open.
        video_path = input("Enter the full path to the video file: ").strip().strip('"\'')
        # In notebooks, you might want to hardcode the path for easier testing
        # video_path = "path/to/your/test_video.mp4"
        final_features = process_video_source(video_path)
    elif choice == '2':
        # Use 0 for the default camera
        final_features = process_video_source(0)
    else:
        print("Invalid choice. Please run the cell again and enter 1 or 2.")

Processing video... Press 'q' in the display window to stop.

--- Processing Complete ---
Final Video-Features vector created successfully.
Shape: (2048,)
Features (first 10 values): [   0.030409    0.015982    0.015829    0.039545    0.032448    0.023265    0.063498     0.18121    0.032631    0.019179]
