In [1]:
import cv2
import torch
import joblib
import numpy as np
from ultralytics import YOLO
from torchvision import transforms
from facenet_pytorch import InceptionResnetV1

In [2]:
yolo = YOLO("yolov8n-face.pt")
facenet = InceptionResnetV1(pretrained='vggface2').eval()
facenet.eval()

InceptionResnetV1(
  (conv2d_1a): BasicConv2d(
    (conv): Conv2d(3, 32, kernel_size=(3, 3), stride=(2, 2), bias=False)
    (bn): BatchNorm2d(32, eps=0.001, momentum=0.1, affine=True, track_running_stats=True)
    (relu): ReLU()
  )
  (conv2d_2a): BasicConv2d(
    (conv): Conv2d(32, 32, kernel_size=(3, 3), stride=(1, 1), bias=False)
    (bn): BatchNorm2d(32, eps=0.001, momentum=0.1, affine=True, track_running_stats=True)
    (relu): ReLU()
  )
  (conv2d_2b): BasicConv2d(
    (conv): Conv2d(32, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
    (bn): BatchNorm2d(64, eps=0.001, momentum=0.1, affine=True, track_running_stats=True)
    (relu): ReLU()
  )
  (maxpool_3a): MaxPool2d(kernel_size=3, stride=2, padding=0, dilation=1, ceil_mode=False)
  (conv2d_3b): BasicConv2d(
    (conv): Conv2d(64, 80, kernel_size=(1, 1), stride=(1, 1), bias=False)
    (bn): BatchNorm2d(80, eps=0.001, momentum=0.1, affine=True, track_running_stats=True)
    (relu): ReLU()
  )
  (conv2d_4a): 

In [3]:
clf = joblib.load("svm_face_recognizer.pkl")
le = joblib.load("label_encoder.pkl")  

In [4]:
def get_embedding(face_img):
    face_img = cv2.resize(face_img, (160, 160))
    face_img = face_img.astype(np.float32) / 255.0
    face_tensor = torch.tensor(face_img).permute(2, 0, 1).unsqueeze(0)
    with torch.no_grad():
        embedding = facenet(face_tensor).numpy()[0]
    return embedding

In [5]:
import uuid
from collections import defaultdict

#Intersection over Union.
def iou(box1, box2):
    x1 = max(box1[0], box2[0])
    y1 = max(box1[1], box2[1])
    x2 = min(box1[2], box2[2])
    y2 = min(box1[3], box2[3])

    inter_area = max(0, x2 - x1) * max(0, y2 - y1)
    box1_area = (box1[2] - box1[0]) * (box1[3] - box1[1])
    box2_area = (box2[2] - box2[0]) * (box2[3] - box2[1])
    union_area = box1_area + box2_area - inter_area

    return inter_area / union_area if union_area != 0 else 0

tracked_faces = {} 
next_id = 0
iou_threshold = 0.6
prob_threshold = 0.6

In [10]:
cap = cv2.VideoCapture('samplefootage.mp4')

In [7]:
while True:
    ret, frame = cap.read()
    if not ret:
        break

    results = yolo(frame)
    new_tracked_faces = {}

    for box in results[0].boxes.xyxy:
        x1, y1, x2, y2 = map(int, box)
        face_crop = frame[y1:y2, x1:x2]

        if face_crop.shape[0] < 30 or face_crop.shape[1] < 30:
            continue

        embedding = get_embedding(face_crop)
        pred = clf.predict([embedding])[0]
        name = le.inverse_transform([pred])[0]
        prob = clf.predict_proba([embedding])[0][pred]
        
        matched_id = None
        for fid, data in tracked_faces.items():
            if iou((x1, y1, x2, y2), data["bbox"]) > iou_threshold:
                matched_id = fid
                break

        if matched_id is None:
            matched_id = str(uuid.uuid4())
            label = name if prob > prob_threshold else "Unknown"
        else:
            if prob > prob_threshold:
                tracked_faces[matched_id]["label"] = name
                tracked_faces[matched_id]["prob"] = prob
            label = tracked_faces[matched_id]["label"]

        new_tracked_faces[matched_id] = {
            "bbox": (x1, y1, x2, y2),
            "label": label,
            "prob": prob
        }

        if prob > prob_threshold:
            label = f"{name} ({prob:.2f})"
            cv2.rectangle(frame, (x1, y1), (x2, y2), (0, 255, 0), 2)
            cv2.putText(frame, label, (x1, y1 - 10),
                        cv2.FONT_HERSHEY_DUPLEX, 0.5, (0, 255, 0), 1)
        else:
            cv2.rectangle(frame, (x1, y1), (x2, y2), (0, 255, 0), 2)
            cv2.putText(frame, label, (x1, y1 - 10),
                        cv2.FONT_HERSHEY_DUPLEX, 0.5, (0, 255, 0), 1)
            

    tracked_faces = new_tracked_faces

    cv2.imshow("Face Recognition", frame)
    if cv2.waitKey(1) & 0xFF == ord("q"):
        break

cap.release()
cv2.destroyAllWindows()


0: 384x640 3 faces, 103.6ms
Speed: 3.5ms preprocess, 103.6ms inference, 69.8ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 3 faces, 7.5ms
Speed: 2.1ms preprocess, 7.5ms inference, 2.3ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 3 faces, 8.4ms
Speed: 1.8ms preprocess, 8.4ms inference, 2.6ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 3 faces, 7.3ms
Speed: 2.1ms preprocess, 7.3ms inference, 2.6ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 3 faces, 6.9ms
Speed: 2.2ms preprocess, 6.9ms inference, 2.3ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 3 faces, 8.8ms
Speed: 2.1ms preprocess, 8.8ms inference, 1.9ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 3 faces, 6.7ms
Speed: 1.9ms preprocess, 6.7ms inference, 2.8ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 3 faces, 8.2ms
Speed: 2.1ms preprocess, 8.2ms inference, 2.0ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640

In [11]:
use_hist_eq = False

while True:
    ret, frame = cap.read()
    if not ret:
        break

    original_frame = frame.copy()

    if use_hist_eq:
        ycrcb = cv2.cvtColor(frame, cv2.COLOR_BGR2YCrCb)
        ycrcb[:, :, 0] = cv2.equalizeHist(ycrcb[:, :, 0])
        frame = cv2.cvtColor(ycrcb, cv2.COLOR_YCrCb2BGR)

    results = yolo(frame, verbose=False)
    new_tracked_faces = {}

    for box in results[0].boxes.xyxy:
        x1, y1, x2, y2 = map(int, box)
        face_crop = frame[y1:y2, x1:x2]

        if face_crop.shape[0] < 30 or face_crop.shape[1] < 30:
            continue

        embedding = get_embedding(face_crop)
        pred = clf.predict([embedding])[0]
        name = le.inverse_transform([pred])[0]
        prob = clf.predict_proba([embedding])[0][pred]

        matched_id = None
        for fid, data in tracked_faces.items():
            if iou((x1, y1, x2, y2), data["bbox"]) > iou_threshold:
                matched_id = fid
                break

        if matched_id is None:
            matched_id = str(uuid.uuid4())
            label = name if prob > prob_threshold else "Unknown"
        else:
            if prob > prob_threshold:
                tracked_faces[matched_id]["label"] = name
                tracked_faces[matched_id]["prob"] = prob
            label = tracked_faces[matched_id]["label"]

        new_tracked_faces[matched_id] = {
            "bbox": (x1, y1, x2, y2),
            "label": label,
            "prob": prob
        }

        label_display = f"{label} ({prob:.2f})" if prob > prob_threshold else label
        cv2.rectangle(frame, (x1, y1), (x2, y2), (0, 255, 0), 2)
        cv2.putText(frame, label_display, (x1, y1 - 10),
                    cv2.FONT_HERSHEY_DUPLEX, 0.5, (0, 255, 0), 1)

    tracked_faces = new_tracked_faces

    cv2.imshow("Face Recognition", frame)
    key = cv2.waitKey(1) & 0xFF
    if key == ord("q"):
        break
    elif key == ord("h"):
        use_hist_eq = not use_hist_eq
        print("Histogram Equalization:", "ON" if use_hist_eq else "OFF")

cap.release()
cv2.destroyAllWindows()


Histogram Equalization: ON
Histogram Equalization: OFF
Histogram Equalization: ON
Histogram Equalization: OFF
Histogram Equalization: ON
Histogram Equalization: OFF
Histogram Equalization: ON
Histogram Equalization: OFF
Histogram Equalization: ON
Histogram Equalization: OFF
Histogram Equalization: ON
Histogram Equalization: OFF
Histogram Equalization: ON
Histogram Equalization: OFF
Histogram Equalization: ON
Histogram Equalization: OFF
Histogram Equalization: ON
Histogram Equalization: OFF
Histogram Equalization: ON
Histogram Equalization: OFF
Histogram Equalization: ON
Histogram Equalization: OFF
Histogram Equalization: ON
Histogram Equalization: OFF
Histogram Equalization: ON
