In [None]:
from ultralytics import YOLO
import cv2
import torch
import torchvision.transforms as transforms
import torch.nn as nn
import time

# Run inference on the GPU when CUDA is available, otherwise fall back to CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Detector used to locate the regions that are passed to the emotion classifier.
# NOTE(review): "yolo11n.pt" is the stock Ultralytics YOLO11-nano checkpoint, which
# is a general COCO object detector rather than a face detector. Replace the path
# with face-detection weights so that only faces are cropped and classified.
face_detector = YOLO("yolo11n.pt")

# Emotion names indexed by the classifier's output id (class_labels[argmax]).
# Only the spelling of the displayed names was corrected ('Neutral', 'Surprise');
# the order is unchanged and must keep matching the class indices the checkpoint
# was trained with.
class_labels = ['Angry', 'Boring', 'Disgust', 'Fear', 'Happy', 'Neutral', 'Sad', 'Stress', 'Surprise']

class EmotionCNN(nn.Module):
    """Small convolutional classifier producing one logit per emotion class.

    The feature extractor keeps spatial size in every convolution (3x3,
    padding 1) and halves it in each of its two 2x2 max-pools. The first
    linear layer expects ``128 * 16 * 16`` features, so inputs must be
    single-channel 64x64 images shaped ``(batch, 1, 64, 64)``.

    Args:
        num_classes: Number of output logits.
    """

    def __init__(self, num_classes):
        super().__init__()

        # The position of every layer inside each Sequential determines its
        # state_dict key (e.g. "features.8.weight"), so the order below must
        # not change or saved checkpoints will no longer load.
        conv_layers = [
            # Stage 1: 1 -> 32 -> 64 channels, then downsample by 2.
            nn.Conv2d(1, 32, kernel_size=3, padding=1),
            nn.ReLU(),
            nn.BatchNorm2d(32),
            nn.Conv2d(32, 64, kernel_size=3, padding=1),
            nn.ReLU(),
            nn.BatchNorm2d(64),
            nn.MaxPool2d(kernel_size=2),
            nn.Dropout(p=0.25),
            # Stage 2: 64 -> 128 channels, then downsample by 2 again.
            nn.Conv2d(64, 128, kernel_size=3, padding=1),
            nn.ReLU(),
            nn.BatchNorm2d(128),
            nn.MaxPool2d(kernel_size=2),
            nn.Dropout(p=0.25),
        ]
        self.features = nn.Sequential(*conv_layers)

        head_layers = [
            nn.Flatten(),
            nn.Linear(128 * 16 * 16, 256),
            nn.ReLU(),
            nn.Dropout(p=0.5),
            nn.Linear(256, num_classes),
        ]
        self.classifier = nn.Sequential(*head_layers)

    def forward(self, x):
        """Return raw class scores (no softmax applied) for the batch ``x``."""
        return self.classifier(self.features(x))


# --- Emotion classifier -------------------------------------------------------
emotion_model = EmotionCNN(len(class_labels)).to(device)
# NOTE(security): torch.load unpickles the checkpoint file. Only load
# "emotion_model.pth" from a trusted source (consider weights_only=True on
# PyTorch versions that support it).
emotion_model.load_state_dict(torch.load("emotion_model.pth", map_location=device))
emotion_model.eval()  # inference mode: dropout off, BatchNorm uses running stats

# Grayscale uint8 image -> float tensor (1, H, W) in [0, 1], then rescaled to
# [-1, 1] via (x - 0.5) / 0.5.
transform = transforms.Compose([
    transforms.ToTensor(),
    transforms.Normalize([0.5], [0.5])
])

# --- Video source -------------------------------------------------------------
cap = cv2.VideoCapture(0)
if not cap.isOpened():
    # Fail loudly instead of silently falling through the loop below.
    cap.release()
    raise RuntimeError("Could not open webcam (device index 0)")

last_prediction_time = 0   # 0 forces a prediction on the very first frame
prediction_interval = 4    # seconds between detection + classification passes
predictions = []           # (x1, y1, x2, y2, label) tuples from the last pass

# --- Main loop ----------------------------------------------------------------
# Detection and classification run only every `prediction_interval` seconds;
# in between, the most recent boxes and labels are redrawn on each new frame.
try:
    while True:
        ret, frame = cap.read()
        if not ret:
            # No frame could be read from the capture device; stop.
            break

        current_time = time.time()
        if current_time - last_prediction_time >= prediction_interval:
            predictions = []  # Clear old predictions
            frame_h, frame_w = frame.shape[:2]

            # Detect candidate regions with the YOLO detector
            results = face_detector(frame)

            for result in results:
                for box in result.boxes:
                    x1, y1, x2, y2 = map(int, box.xyxy[0].tolist())

                    # Clamp to the frame and skip degenerate boxes: an empty
                    # crop would make cv2.cvtColor raise.
                    x1, x2 = max(0, x1), min(frame_w, x2)
                    y1, y2 = max(0, y1), min(frame_h, y2)
                    if x2 <= x1 or y2 <= y1:
                        continue

                    # Crop face region
                    face_img = frame[y1:y2, x1:x2]

                    # Preprocess: grayscale, 64x64, normalized, batch dim added
                    face_gray = cv2.cvtColor(face_img, cv2.COLOR_BGR2GRAY)
                    face_resized = cv2.resize(face_gray, (64, 64))
                    face_tensor = transform(face_resized).unsqueeze(0).to(device)

                    with torch.no_grad():
                        output = emotion_model(face_tensor)
                    pred = torch.argmax(output, 1).item()
                    emotion_label = class_labels[pred]

                    # Store prediction for drawing on this and later frames
                    predictions.append((x1, y1, x2, y2, emotion_label))

            last_prediction_time = current_time

        # Draw bounding boxes and labels from the last prediction pass
        for (x1, y1, x2, y2, label) in predictions:
            cv2.rectangle(frame, (x1, y1), (x2, y2), (0, 255, 0), 2)
            # Keep the text baseline inside the frame for boxes near the top edge.
            label_y = max(y1 - 10, 20)
            cv2.putText(frame, label, (x1, label_y),
                        cv2.FONT_HERSHEY_SIMPLEX, 0.9, (0, 255, 0), 2)

        cv2.imshow("Face Emotion Recognition", frame)

        # Press 'q' in the preview window to quit
        if cv2.waitKey(1) & 0xFF == ord('q'):
            break
finally:
    # Always free the camera and close windows, even after an exception or
    # an interrupted notebook cell.
    cap.release()
    cv2.destroyAllWindows()
