VIDEOS: https://www.storyblocks.com/video/search/celebrity

In [1]:
import ast
import math

import cv2
import faiss
import numpy as np
import pandas as pd
import PIL
import torch
# ImageDraw / ImageFont are submodules: a bare `import PIL` does not load them,
# so import them explicitly for the `PIL.ImageDraw` / `PIL.ImageFont` uses below.
from PIL import Image, ImageDraw, ImageFont
from tqdm import tqdm
from ultralytics import YOLO

from src.neural_nets import *

#### Предобработка изображения (детекция + выравнивание)

In [2]:
class ImageProcessing:
    """Detect faces with a YOLO face model and turn them into aligned crops.

    Every detected face is described by a dict with the keys ``conf``
    (detection confidence), ``box_points`` (``[x1, y1, x2, y2]``),
    ``key_points`` (list of ``[x, y]``) and ``face_center`` (``[x, y]``).
    The first three key points are treated as left eye, right eye and nose.
    """

    # One colour per drawn point: the detector key points followed by the face centre.
    _POINT_COLORS = ('red', 'blue', 'green', 'orange', 'black', 'brown')

    def __init__(self, model_path='./yolov8n-face.pt', conf=0.4, face_size=112) -> None:
        """Load the face detector.

        Args:
            model_path: weights file handed to ``YOLO``.
            conf: confidence threshold passed to the detector on every call.
            face_size: side length (pixels) of the square crops returned by
                ``get_faces_from_image``.
        """
        self.face_detector = YOLO(model_path)
        self.conf = conf
        self.face_size = face_size

    def get_central_of_face(self, face_info):
        """Return ``[x, y]``: the floor-averaged position of both eyes and the nose."""
        left_eye, right_eye, nose = face_info['key_points'][:3]
        center_x = (left_eye[0] + right_eye[0] + nose[0]) // 3
        center_y = (left_eye[1] + right_eye[1] + nose[1]) // 3
        return [center_x, center_y]

    def detect_faces(self, image):
        """Run the detector on ``image`` and return one info dict per face."""
        res = self.face_detector(image, show=False, save=False, conf=self.conf,
                                 save_txt=False, save_crop=False, verbose=False)[0]

        faces = []
        for ind in range(len(res.boxes)):
            box_points = res.boxes.xyxy[ind].cpu().numpy().astype(int)
            # Keep only the (x, y) columns of each key point.
            keypoints = res.keypoints[ind].data.cpu().numpy()[0][:, :2].astype(int)
            face = {
                "conf": res.boxes.conf[ind].cpu().item(),
                "box_points": box_points.tolist(),
                "key_points": keypoints.tolist(),
            }
            face['face_center'] = self.get_central_of_face(face)
            faces.append(face)

        return faces

    def get_modif_image(self, image):
        """Return a copy of ``image`` with boxes, key points and face centres drawn."""
        faces_info = self.detect_faces(image)

        img_copy = image.copy()
        draw = PIL.ImageDraw.Draw(img_copy)
        p_wdth = 4  # half-size of each point marker, in pixels

        for face in faces_info:
            draw.rectangle(face['box_points'], width=2, outline='red')

            points = face['key_points'] + [face['face_center']]
            # zip() stops at the shorter sequence, so an unexpected extra key
            # point is skipped instead of overrunning the palette.
            for (x, y), color in zip(points, self._POINT_COLORS):
                draw.ellipse([x - p_wdth, y - p_wdth, x + p_wdth, y + p_wdth], fill=color)

        return img_copy

    def crop_face(self, image, face):
        """Crop a square around ``face_center`` large enough to contain the whole box."""
        box_points, f_center = face['box_points'], face['face_center']

        # Half-side of the square: the largest distance from the centre to any box edge.
        max_sz = max(
            abs(box_points[0] - f_center[0]),
            abs(box_points[1] - f_center[1]),
            abs(box_points[2] - f_center[0]),
            abs(box_points[3] - f_center[1]),
        )

        new_bbox = (f_center[0] - max_sz, f_center[1] - max_sz,
                    f_center[0] + max_sz, f_center[1] + max_sz)

        return image.crop(new_bbox)

    def align_face(self, face, face_info):
        """Rotate ``face`` so that the line through both eyes becomes horizontal.

        Args:
            face: object exposing ``rotate(degrees)`` (the crops produced by
                ``crop_face`` are used here).
            face_info: info dict whose first two key points are the eyes.

        Returns:
            Whatever ``face.rotate`` returns for the computed signed angle.
        """
        left_eye, right_eye = face_info['key_points'][:2]

        dx = abs(right_eye[0] - left_eye[0])
        dy = abs(right_eye[1] - left_eye[1])

        # Tilt of the eye line in [0, 90] degrees. atan2(dy, dx) equals
        # arccos(dx / eye_distance) but yields 0 instead of NaN when both eye
        # key points coincide (eye distance of zero).
        angle = math.degrees(math.atan2(dy, dx))

        # Image y grows downwards: a lower left eye (larger y) needs a negative rotation.
        direction = -1 if left_eye[1] > right_eye[1] else 1

        return face.rotate(angle * direction)

    def get_faces_from_image(self, image):
        """Detect, crop, align and resize every face found in ``image``.

        Returns:
            ``(faces, faces_info)``: the aligned ``face_size`` x ``face_size``
            images and the matching info dicts, in the same order.
        """
        faces_info = self.detect_faces(image)
        target_size = (self.face_size, self.face_size)

        faces = []
        for info in faces_info:
            aligned = self.align_face(self.crop_face(image, info), info)
            faces.append(Image.fromarray(cv2.resize(np.array(aligned), target_size)))

        return faces, faces_info
    
#inference = ImageProcessing()

In [180]:
# TEST: detect, crop and align the faces of one sample picture.

# `inference` is otherwise only created in the setup cell near the end of the
# notebook, so build it here to keep this cell runnable on a fresh kernel.
inference = ImageProcessing()

# Relative to the notebook's working directory, like the other data paths in this notebook.
test_img_path = 'data/pictures/Aaron_Eckhart_0001.jpg'
img = PIL.Image.open(test_img_path)

res = inference.get_faces_from_image(img)

#inference.get_modif_image(img)

#### Экстрактор Фичей

In [181]:
# Checkpoint path for FaceEmbedder, relative to the notebook's working directory
# (a later cell reassigns F_MODEL_PATH before the embedder is actually built).
F_MODEL_PATH = 'logs/8/best_model.pt'

In [3]:
class FaceEmbedder:
    """Wrap an ``EmbedderNet`` checkpoint and its image preprocessor."""

    def __init__(self, model_path, backbone_name, device) -> None:
        """Build the network, load its weights and switch it to eval mode.

        Args:
            model_path: checkpoint file holding the model ``state_dict``.
            backbone_name: ``'vit'`` selects ``VIT_PATH`` for the preprocessor;
                any other value selects ``EFFICIENTNET_PATH``.
            device: torch device the model and the inputs are moved to.
        """
        self.model = EmbedderNet(EMBED_SIZE, backbone_name).to(device)
        self.preprocessor = AutoImageProcessor.from_pretrained(VIT_PATH if backbone_name == 'vit' else EFFICIENTNET_PATH)
        self.device = device

        ckpt = torch.load(model_path, map_location=device)
        self.model.load_state_dict(ckpt)
        self.model.eval()

    def get_emb(self, face):
        """Return the embedding of a single face as a CPU tensor with a leading batch axis of 1."""
        pixel_values = self.preprocessor(face)['pixel_values'][0]
        face_tensor = torch.unsqueeze(torch.tensor(pixel_values), 0).to(self.device)

        # Inference only: skip autograd bookkeeping instead of building a graph
        # and detaching it afterwards.
        with torch.no_grad():
            face_emb = self.model(face_tensor)

        return face_emb.detach().cpu()
    
#embedder = FaceEmbedder(F_MODEL_PATH, 'eff', 'cuda')

In [183]:
# TEST: extract the aligned faces of one picture.
# Needs `inference` (an ImageProcessing instance) to exist already.

# Relative to the notebook's working directory, like the other data paths in this notebook.
image = Image.open("data/pictures/73023_v9_ba.jpg")
faces, faces_info = inference.get_faces_from_image(image)

In [184]:
# Shape check of one face embedding; relies on `embedder` and `faces` created in other cells.
embedder.get_emb(faces[0]).shape

torch.Size([1, 512])

#### База данных лиц

In [60]:
class PersonDataBase:
    """Nearest-neighbour lookup of face embeddings in a faiss index.

    The dump is a ``;``-separated CSV with the columns ``image_embd``
    (a Python-literal list of floats) and ``label``.
    """

    _METRICS = ('cosine', 'euclid')

    def __init__(self, embd_size, embedder, compare_metric, threshold) -> None:
        """Store the configuration; no index exists until ``load_dump`` is called.

        Args:
            embd_size: dimensionality of the stored embeddings.
            embedder: object exposing ``get_emb(image)``.
            compare_metric: ``'cosine'`` (inner-product index) or ``'euclid'`` (L2 index).
            threshold: largest score ``recognize`` still accepts as a match.
        """
        self.cmp_m = compare_metric
        self.thrhld = threshold
        self.embedder = embedder
        self.embd_s = embd_size

    def load_dump(self, dump_file):
        """Read a dump written by ``make_dump`` and build the faiss index from it.

        Raises:
            ValueError: on an unsupported metric, or when the stored vectors
                do not have ``embd_size`` components.
        """
        # Fail early and clearly; previously an unknown metric left `self.index`
        # undefined and surfaced later as an AttributeError.
        if self.cmp_m not in self._METRICS:
            raise ValueError(
                f"Unsupported compare_metric {self.cmp_m!r}; expected one of {self._METRICS}")

        self.dump = pd.read_csv(dump_file, sep=';')
        self.dump['image_embd'] = self.dump['image_embd'].apply(ast.literal_eval)

        if self.cmp_m == 'cosine':
            self.index = faiss.IndexFlatIP(self.embd_s)
        else:
            self.index = faiss.IndexFlatL2(self.embd_s)

        # faiss works on float32 matrices; parsed Python floats would give float64.
        vectors = np.asarray(self.dump['image_embd'].to_list(), dtype=np.float32)
        if vectors.size == 0:
            return  # empty dump: keep the empty index
        if vectors.ndim != 2 or vectors.shape[1] != self.embd_s:
            raise ValueError(
                f"Dump vectors have shape {vectors.shape}, expected (n, {self.embd_s})")
        self.index.add(vectors)

    @torch.inference_mode()
    def norm_single(self, embed):
        """Scale ``embed`` to unit L2 norm (the norm is taken over the whole tensor)."""
        # The clamp keeps an all-zero embedding at zero instead of turning it into NaN.
        return embed / torch.linalg.norm(embed).clamp_min(1e-12)

    def make_dump(self, images_file, dump_file):
        """Embed every image listed in ``images_file`` and write the dump CSV.

        ``images_file`` is a ``;``-separated CSV with the columns
        ``relative_path``, ``image_name`` and ``label``.
        """
        images_info = pd.read_csv(images_file, sep=';')
        rows = zip(images_info['relative_path'], images_info['image_name'], images_info['label'])

        tmp_dump = []
        for rel_path, img_name, cur_img_label in tqdm(rows, total=images_info.shape[0]):
            cur_img_path = f"./{rel_path}/{img_name}"

            # Close each file as soon as it has been embedded.
            with Image.open(cur_img_path) as image:
                image_tensor = self.embedder.get_emb(image)[0]
            image_norm = self.norm_single(image_tensor)
            tmp_dump.append((image_norm.tolist(), cur_img_label))

        dump = pd.DataFrame(tmp_dump, columns=['image_embd', 'label'])
        dump.to_csv(dump_file, sep=';', index=False)

    def recognize(self, face):
        """Return ``(label, score)`` for the stored embedding closest to ``face``.

        ``score`` is a distance (lower is better): ``1 - inner product`` for
        ``'cosine'``, and the raw value reported by the index for ``'euclid'``.
        The label is ``"Unknown"`` when the score exceeds the threshold or the
        index holds no vectors. The score is rounded to two decimals.
        """
        face_tensor = self.embedder.get_emb(face)
        image_norm = self.norm_single(face_tensor)

        # Hand faiss an explicit (1, d) float32 array rather than a torch tensor.
        query = np.ascontiguousarray(image_norm.numpy(), dtype=np.float32).reshape(1, -1)
        dist, ann = self.index.search(query, k=1)

        neighbour = int(ann[0][0])
        if neighbour < 0:
            # faiss reports -1 when there is nothing to return (empty index).
            return ("Unknown", float("inf"))

        person = self.dump['label'][neighbour]
        raw_score = float(dist[0][0])
        sim_score = 1 - raw_score if self.cmp_m == 'cosine' else raw_score

        recognized_person = person if sim_score <= self.thrhld else "Unknown"

        return (recognized_person, round(sim_score, 2))

In [35]:
# TEST

# Arguments: embedding size, embedder, metric, threshold. With 'cosine',
# recognize() accepts a match when 1 - inner product <= threshold.
# Relies on `embedder` created in another cell.
base = PersonDataBase(512, embedder, 'cosine', 0.19)

# Uncomment to rebuild the dump from the image list before loading it.
# base.make_dump("data/images_info.csv", "data/dump_info.csv")
base.load_dump("data/dump_info.csv")

In [36]:
# Recognise the first face found in a sample picture; relies on `inference` and `base`.
# NOTE(review): this cell's execution count (36) is lower than that of the
# PersonDataBase cell (60), so the recorded output may predate the current
# class definition -- re-run to confirm.
img = Image.open("data/pictures/Aaron_Eckhart_0001.jpg")
faces, faces_info = inference.get_faces_from_image(img)
print(base.recognize(faces[0]))

('Aaron_Eckhart', 0.95)


#### Распознавание лиц (на картинках, на видео)

In [61]:
class FaceDetector:
    """Detect and label known persons on still images and in videos."""

    # One colour per drawn point: the detector key points followed by the face centre.
    _POINT_COLORS = ('red', 'blue', 'green', 'orange', 'black', 'brown')

    def __init__(self, processor, database) -> None:
        """
        Args:
            processor: object exposing ``get_faces_from_image(image)``.
            database: object exposing ``recognize(face)`` that returns ``(label, score)``.
        """
        self.processor = processor
        self.person_db = database
        self.text_font = PIL.ImageFont.truetype("MontserratBlack-3zOvZ.ttf", size=20)

    def detect_image(self, input_file, output_file="output.jpg"):
        """Annotate the image at ``input_file`` and save the result to ``output_file``."""
        # Close the source file deterministically. Converting to RGB keeps
        # palette/alpha inputs usable by the pipeline and saveable as JPEG.
        with Image.open(input_file) as img:
            upd_img = self.detect_frame(img.convert("RGB"))
        upd_img.save(output_file)

    def update_frame(self, frame, faces_info, persons_info):
        """Return a copy of ``frame`` with boxes, key points and ``label(score)`` captions drawn."""
        frame_copy = frame.copy()
        draw = PIL.ImageDraw.Draw(frame_copy)
        p_wdth = 4  # half-size of each point marker, in pixels

        for f_info, p_info in zip(faces_info, persons_info):
            box = f_info['box_points']
            draw.rectangle(box, width=2, outline='red')

            points = f_info['key_points'] + [f_info['face_center']]
            # zip() stops at the shorter sequence, so an unexpected extra key
            # point is skipped instead of overrunning the palette.
            for (x, y), color in zip(points, self._POINT_COLORS):
                draw.ellipse([x - p_wdth, y - p_wdth, x + p_wdth, y + p_wdth], fill=color)

            # Keep the caption inside the image for faces close to the top/left border.
            text_pos = (max(box[0] - 20, 0), max(box[1] - 20, 0))
            draw.text(text_pos, f"{p_info[0]}({str(p_info[1])})", fill='yellow',
                      align="left", font=self.text_font)

        return frame_copy

    def detect_frame(self, frame):
        """Detect the faces on ``frame``, look each one up and return the annotated copy."""
        # detect faces
        faces, faces_info = self.processor.get_faces_from_image(frame)

        # find relative persons in datastore
        persons_info = [self.person_db.recognize(face) for face in faces]

        # update frame
        return self.update_frame(frame, faces_info, persons_info)

    def detect_video(self, input_file, output_file='output.avi'):
        """Annotate every frame of ``input_file`` and write an MJPG video to ``output_file``.

        Raises:
            IOError: when the input cannot be opened or the output cannot be created.
        """
        cap = cv2.VideoCapture(input_file)
        writer = None
        try:
            if not cap.isOpened():
                raise IOError(f"Cannot open video file: {input_file}")

            width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH) + 0.5)
            height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT) + 0.5)

            # Keep the source frame rate; fall back to 30 when it is not reported.
            fps = cap.get(cv2.CAP_PROP_FPS)
            if not (fps > 0):
                fps = 30

            writer = cv2.VideoWriter(output_file, cv2.VideoWriter_fourcc(*"MJPG"), fps, (width, height))
            if not writer.isOpened():
                raise IOError(f"Cannot create video file: {output_file}")

            length = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
            print("Video frames amount: ",length)

            # The reported frame count is only used for the progress bar; the
            # loop itself runs until the capture stops delivering frames.
            with tqdm(total=length if length > 0 else None) as progress:
                while True:
                    ret, frame = cap.read()
                    if not ret:
                        break

                    # OpenCV delivers BGR frames while the rest of the pipeline
                    # works on RGB PIL images (as in detect_image).
                    rgb_frame = Image.fromarray(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))
                    upd_frame = self.detect_frame(rgb_frame)

                    # Back to BGR for the writer.
                    writer.write(cv2.cvtColor(np.array(upd_frame), cv2.COLOR_RGB2BGR))
                    progress.update(1)
        finally:
            # Release the handles even when a frame fails to process.
            if writer is not None:
                writer.release()
            cap.release()


In [72]:
# Checkpoint used by the setup cell below, relative to the notebook's working directory.
F_MODEL_PATH = 'logs/14/best_model.pt'

In [90]:
# Build the pipeline: face detector -> embedder -> person database.
# Use the GPU when one is available instead of failing on CPU-only machines.
DEVICE = 'cuda' if torch.cuda.is_available() else 'cpu'

inference = ImageProcessing()
embedder = FaceEmbedder(F_MODEL_PATH, 'vit', DEVICE)
# Arguments: embedding size, embedder, metric, threshold.
base = PersonDataBase(512, embedder, 'cosine', 0.84)

Some weights of ViTModel were not initialized from the model checkpoint at jayanta/vit-base-patch16-224-in21k-face-recognition and are newly initialized: ['vit.pooler.dense.bias', 'vit.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [74]:
# Embed every image listed in data/images_info.csv and write the dump CSV.
# Slow: the recorded run below took about 9 minutes for 13233 images.
base.make_dump("data/images_info.csv", "data/dump_info_14.csv")

  0%|          | 0/13233 [00:00<?, ?it/s]

100%|██████████| 13233/13233 [09:15<00:00, 23.84it/s]


In [91]:
# Load the dump written by make_dump and build the search index from it.
base.load_dump("data/dump_info_14.csv")

In [50]:
#base.thrhld = 0.90

In [92]:
# Demo inputs, both relative to the notebook's working directory.
img_path = 'data/pictures/Aaron_Eckhart_0001.jpg'
video_path = "data/videos/test.mp4"

In [93]:
# Combine the image processor and the person database into the end-to-end detector.
face_detector = FaceDetector(inference, base)

In [94]:
# Annotate the sample picture; the result is saved to the default "output.jpg".
face_detector.detect_image(img_path)

In [95]:
# Annotate the sample video frame by frame; the result is saved to the default 'output.avi'.
face_detector.detect_video(video_path)

Video frames amount:  586


100%|██████████| 586/586 [01:26<00:00,  6.80it/s]
