
# Concepte și Aplicații în Vederea Artificială - Tema 2
### Detectarea și recunoașterea facială a personajelor din serialul de desene animate Laboratorul lui Dexter
<p style='text-align: right;'> Bucă Mihnea-Vicențiu </p>

### Paths

In [1]:
# Folder layout used throughout the notebook. Every value is used as a string
# prefix (joined with '+'), so each one keeps its trailing '/'.
save_train_data = 'crops/'  # 64x64 training crops, one sub-folder per class
cluster_train_data = 'clusters/'  # pickled KMeans models ('<class>_kmeans.pkl')
path_train_data = '../antrenare/'  # training images and '<name>_annotations.txt'
test_data = '../testare/' # path to test data
npy_response = '352_Mihnea-Vicentiu_Buca/'  # detection results are saved below this folder
npy_load_data = 'curr_data/'  # cached '<class>_images.npy' arrays

### Libraries

In [2]:
import os
import cv2
import torch
import pickle
import numpy as np
import torch.nn as nn
import torch.optim as optim
from torchvision.ops import nms
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans
from torchvision.transforms import ToTensor
from torch.utils.data import Dataset, DataLoader

In [3]:
import sys
import sklearn
# Report the interpreter and library versions this notebook was run with.
print('Python version:', sys.version)
print('PyTorch version:', torch.__version__)
print('OpenCV version:', cv2.__version__)
print('Numpy version:', np.__version__)
# NOTE: pickle.format_version is the pickle file-format version, not a
# package release number.
print('Pickle version:', pickle.format_version)
print('Matplotlib version:', plt.matplotlib.__version__)
print('Sklearn version:', sklearn.__version__)

Python version: 3.13.1 (tags/v3.13.1:0671451, Dec  3 2024, 19:06:28) [MSC v.1942 64 bit (AMD64)]
PyTorch version: 2.7.0.dev20250118+cpu
OpenCV version: 4.10.0
Numpy version: 2.1.2
Pickle version: 4.0
Matplotlib version: 3.9.2
Sklearn version: 1.5.2


### Positive / negative examples

In [4]:
def read_annotations(annotations_path):
    """Parse an annotation file into a list of face records.

    Every non-blank line is expected to hold six whitespace-separated fields:
    ``image_name x1 y1 x2 y2 character``.

    Args:
        annotations_path: Path of the text file to read.

    Returns:
        A list of dicts, in file order, with the keys ``'image'`` (str),
        ``'coordinates'`` (tuple of four ints) and ``'character'`` (str).

    Raises:
        IndexError: If a non-blank line has fewer than six fields.
        ValueError: If a coordinate field is not an integer.
    """
    classifications = []
    with open(annotations_path, 'r') as file:
        for line in file:
            fields = line.split()
            if not fields:
                # Blank lines (e.g. an extra newline at the end of the file)
                # carry no annotation; indexing them would raise IndexError.
                continue
            classifications.append({
                'image': fields[0],
                'coordinates': tuple(map(int, fields[1:5])),
                'character': fields[5],
            })

    return classifications

def process_classification(classification, image_path):
    """Load an annotated image and cut out its box as a 64x64 crop.

    Args:
        classification: Record with the keys ``'image'``, ``'coordinates'``
            (``x1, y1, x2, y2``) and ``'character'``, as produced by
            ``read_annotations``.
        image_path: Folder prefix, including its trailing separator, that is
            concatenated with the image file name.

    Returns:
        A ``(crop, character)`` tuple: the annotated region resized to 64x64
        and the character label of the record.

    Raises:
        FileNotFoundError: If OpenCV cannot read the image file.
    """
    x1, y1, x2, y2 = classification['coordinates']
    full_path = image_path + classification['image']
    image = cv2.imread(full_path)
    if image is None:
        # cv2.imread reports failure by returning None instead of raising;
        # fail here with the offending path rather than on the slice below.
        raise FileNotFoundError(f'Could not read image: {full_path}')
    crop = cv2.resize(image[y1:y2, x1:x2], (64, 64))
    return crop, classification['character']

def area_bbox(bbox):
    """Return the area of a box ``(x1, y1, x2, y2)`` with inclusive corners.

    Both corners count as part of the box, hence the ``+ 1`` on each side.
    """
    width = bbox[2] - bbox[0] + 1
    height = bbox[3] - bbox[1] + 1
    return width * height

def intersection_over_union(bbox_a, bbox_b):
    """Return the intersection-over-union of two ``(x1, y1, x2, y2)`` boxes.

    Corners are treated as inclusive (see ``area_bbox``).  Returns ``0`` when
    the union area is zero.
    """
    # Corners of the overlap rectangle.
    left = max(bbox_a[0], bbox_b[0])
    top = max(bbox_a[1], bbox_b[1])
    right = min(bbox_a[2], bbox_b[2])
    bottom = min(bbox_a[3], bbox_b[3])

    # A negative extent means the boxes do not overlap on that axis.
    overlap_width = max(0, right - left + 1)
    overlap_height = max(0, bottom - top + 1)
    overlap = overlap_width * overlap_height

    union = float(area_bbox(bbox_a) + area_bbox(bbox_b) - overlap)
    if union == 0:
        return 0
    return overlap / union

def intersection_over_union_scaled(bbox_a, bbox_b, tresh_hold):
    """Return True when the IoU of the boxes stays within a size-scaled limit.

    The limit is ``tresh_hold * area(bbox_b) / area(bbox_a)``, so it grows
    with the size of ``bbox_b`` relative to ``bbox_a``.
    """
    overlap_ratio = intersection_over_union(bbox_a, bbox_b)
    scaled_limit = tresh_hold * area_bbox(bbox_b) / area_bbox(bbox_a)
    return overlap_ratio <= scaled_limit

def get_path(folder, index):
    """Return the crop file path ``<save_train_data><folder>/<index>.jpg``."""
    pieces = (save_train_data, folder, '/', str(index), '.jpg')
    return ''.join(pieces)

def run_training():
    """Rebuild the on-disk training set from the annotated images.

    Steps, in order: empty the crop and cluster folders, fit one KMeans model
    per character on the (aspect ratio, height) of its annotated boxes, write
    the positive 64x64 crops, sample negative crops, and cache every class as
    a ``.npy`` array.  Inputs and outputs are the module-level path constants;
    nothing is returned.
    """
    source_folders = ['dad', 'mom', 'dexter', 'deedee']  # folders that have an annotation file
    face_classes = source_folders + ['unknown']          # labels that may appear in annotations
    all_classes = face_classes + ['negative']

    # Next free file index for each class folder.
    curr_run = {label: 0 for label in all_classes}

    def clear_folders():
        """Delete the files left over from a previous run."""
        for folder in all_classes:
            for file in os.listdir(save_train_data + folder):
                os.remove(save_train_data + folder + '/' + file)

        for file in os.listdir(cluster_train_data):
            os.remove(cluster_train_data + file)

    def get_annotations():
        """Write one 64x64 crop per annotated box into its class folder."""
        for name in source_folders:
            annotations_path = path_train_data + name + '_annotations.txt'
            image_path = path_train_data + name + '/'
            for classification in read_annotations(annotations_path):
                crop, character = process_classification(classification, image_path)
                cv2.imwrite(get_path(character, curr_run[character]), crop)
                curr_run[character] += 1

    def get_clusters():
        """Fit and pickle a 5-cluster KMeans of box shapes for every class."""
        shapes = {label: [] for label in face_classes}

        for name in source_folders:
            annotations_path = path_train_data + name + '_annotations.txt'
            for classification in read_annotations(annotations_path):
                x1, y1, x2, y2 = classification['coordinates']
                box_height = y2 - y1 + 1
                # Only the label is needed here, so the image is not loaded.
                shapes[classification['character']].append(
                    ((x2 - x1 + 1) / box_height, box_height)
                )

        for character, samples in shapes.items():
            kmeans = KMeans(n_clusters=5, random_state=0)
            kmeans.fit(samples)
            with open(cluster_train_data + character + '_kmeans.pkl', 'wb') as file:
                pickle.dump(kmeans, file)

    def negative_annotations():
        """Sample random crops that barely overlap any annotated face."""
        for name in source_folders:
            annotations_path = path_train_data + name + '_annotations.txt'
            images_path = path_train_data + name + '/'

            # Group the annotated boxes by image.
            all_characters_in_img = {}
            for classification in read_annotations(annotations_path):
                all_characters_in_img.setdefault(classification['image'], []).append(
                    classification['coordinates']
                )

            # The model depends only on the folder, so load it once here
            # instead of once per image and sampling round.
            with open(cluster_train_data + name + '_kmeans.pkl', 'rb') as file:
                cluster_centers = pickle.load(file).cluster_centers_

            for image_name, rects in all_characters_in_img.items():
                image = cv2.imread(images_path + image_name)

                # NOTE(review): the original loop iterated over the five class
                # names but never used the loop variable, so every round draws
                # from this folder's clusters and the 'unknown' clusters are
                # never sampled. Behaviour kept; confirm this is intended.
                for _ in range(len(face_classes)):
                    indices = np.random.choice(len(cluster_centers), 4, replace=False)

                    for aspect_ratio, height in cluster_centers[indices]:
                        height = int(height)
                        width = int(aspect_ratio * height)

                        # np.random.randint needs high > low, so the window
                        # must be strictly smaller than the image on both axes.
                        if width >= image.shape[1] or height >= image.shape[0]:
                            continue

                        x1 = np.random.randint(0, image.shape[1] - width)
                        y1 = np.random.randint(0, image.shape[0] - height)
                        x2 = x1 + width
                        y2 = y1 + height

                        candidate = (x1, y1, x2, y2)
                        if all(intersection_over_union_scaled(candidate, rect, 0.1)
                               for rect in rects):
                            crop = cv2.resize(image[y1:y2, x1:x2], (64, 64))
                            cv2.imwrite(get_path('negative', curr_run['negative']), crop)
                            curr_run['negative'] += 1

    def save_npy_data():
        """Cache the crops of every class as '<class>_images.npy'."""
        for name in all_classes:
            folder = save_train_data + name
            print(folder)
            images = [cv2.imread(folder + '/' + file) for file in os.listdir(folder)]
            np.save(npy_load_data + name + '_images.npy', np.array(images))

    clear_folders()
    get_clusters()
    get_annotations()
    negative_annotations()
    save_npy_data()

In [5]:
# run_training()

### Train CNN

In [6]:
class Dataset(Dataset):
    """In-memory binary dataset of image crops.

    The class shadows the imported ``torch.utils.data.Dataset`` base class;
    the name is kept because ``train_classifier`` instantiates it as
    ``Dataset``.
    """

    def __init__(self, positive_descriptors, negative_descriptors):
        """Build the sample and label tensors.

        Args:
            positive_descriptors: Iterable of equally-shaped channel-last
                images; each one gets label 1.
            negative_descriptors: Iterable of images of the same shape; each
                one gets label 0.
        """
        samples = list(positive_descriptors)
        num_positive = len(samples)
        samples.extend(negative_descriptors)
        num_negative = len(samples) - num_positive

        # Stack into a single ndarray first: creating a tensor directly from
        # a list of ndarrays is the slow path PyTorch warns about.
        stacked = np.asarray(samples)

        # Scale by 1/255 and move the last axis to position 1
        # (N, H, W, C) -> (N, C, H, W).
        self.data = torch.tensor(stacked, dtype=torch.float32) / 255.0
        self.data = self.data.permute(0, 3, 1, 2)
        self.labels = torch.tensor(
            [1] * num_positive + [0] * num_negative, dtype=torch.float32
        )

    def __len__(self):
        """Return the number of samples."""
        return len(self.data)

    def __getitem__(self, idx):
        """Return the ``(image, label)`` pair at position ``idx``."""
        return self.data[idx], self.labels[idx]

class DepthwiseSeparableConv(nn.Module):
    """A per-channel convolution followed by a 1x1 channel-mixing convolution."""

    def __init__(self, in_channels, out_channels, kernel_size=3, padding=1):
        super().__init__()
        # groups == in_channels: every input channel is filtered on its own.
        self.depthwise = nn.Conv2d(
            in_channels,
            in_channels,
            kernel_size,
            padding=padding,
            groups=in_channels,
        )
        # The 1x1 convolution combines the filtered channels into out_channels.
        self.pointwise = nn.Conv2d(in_channels, out_channels, kernel_size=1)

    def forward(self, x):
        """Apply the depthwise step, then the pointwise step."""
        return self.pointwise(self.depthwise(x))

class CNN(nn.Module):
    """Small binary classifier for 3-channel images.

    Three depthwise-separable stages (conv, ReLU, batch norm, 2x2 max-pool)
    are followed by global average pooling and a two-layer head that ends in
    a sigmoid, so the output has one value per sample.
    """

    def __init__(self):
        super().__init__()

        # Layers are created in the same order as they are applied, which
        # keeps the indices inside the Sequential container stable.
        layers = []
        for in_channels, out_channels in ((3, 32), (32, 64), (64, 128)):
            layers.append(DepthwiseSeparableConv(in_channels, out_channels))
            layers.append(nn.ReLU())
            layers.append(nn.BatchNorm2d(out_channels))
            layers.append(nn.MaxPool2d(2))

        layers.append(nn.AdaptiveAvgPool2d(1))  # GAP
        layers.append(nn.Flatten())
        layers.append(nn.Linear(128, 256))
        layers.append(nn.ReLU())
        layers.append(nn.Dropout(0.5))
        layers.append(nn.Linear(256, 1))
        layers.append(nn.Sigmoid())

        self.model = nn.Sequential(*layers)

    def forward(self, x):
        """Return the sigmoid output of the network for a batch ``x``."""
        return self.model(x)

In [7]:
# train cnn network with data
def get_positive_descriptors(list_positives):
    """Load the cached images of every class in ``list_positives``.

    Returns one flat list holding the images of all requested classes, in
    the order the class names were given.
    """
    return [
        image
        for class_name in list_positives
        for image in np.load(npy_load_data + class_name + '_images.npy')
    ]

def get_negative_descriptors(list_negatives):
    """Load the cached images of every class in ``list_negatives``.

    Returns one flat list holding the images of all requested classes, in
    the order the class names were given.
    """
    collected = []
    for class_name in list_negatives:
        cached = np.load(npy_load_data + class_name + '_images.npy')
        collected += list(cached)
    return collected

def train_classifier(name, list_positives, list_negatives):
    """Train a binary CNN and save its weights to ``<name>_cnn.pth``.

    Args:
        name: Prefix of the weights file that is written.
        list_positives: Class names whose cached images are labelled 1.
        list_negatives: Class names whose cached images are labelled 0.
    """
    # Get data
    print("Loading data...")
    positive_descriptors = get_positive_descriptors(list_positives)
    negative_descriptors = get_negative_descriptors(list_negatives)
    print("Data loaded")

    # Create dataset and dataloader
    print("Creating dataset and dataloader...")
    dataset = Dataset(positive_descriptors, negative_descriptors)
    dataloader = DataLoader(dataset, batch_size=32, shuffle=True)

    # Initialize model, loss function, and optimizer
    print("Initializing model, loss function, and optimizer...")
    model = CNN()
    criterion = nn.MSELoss()
    optimizer = optim.Adam(model.parameters(), lr=0.001, weight_decay=0.0001)

    # Training loop
    print("Training model...")
    epochs = 5
    for epoch in range(epochs):
        model.train()
        running_loss = 0.0
        for inputs, labels in dataloader:
            # Zero the parameter gradients
            optimizer.zero_grad()

            # Forward pass. squeeze(1) removes only the single output unit
            # axis, so a final batch holding one sample stays shaped (1,) and
            # matches its labels; a bare squeeze() would collapse it to 0-d.
            outputs = model(inputs).squeeze(1)
            loss = criterion(outputs, labels)

            # Backward pass and optimize
            loss.backward()
            optimizer.step()

            running_loss += loss.item()

        # Mean of the per-batch losses for this epoch.
        print(f"Epoch {epoch+1}/{epochs}, Loss: {running_loss/len(dataloader):.4f}")

    # Save the trained model
    torch.save(model.state_dict(), f'{name}_cnn.pth')
    print(f'Model saved as {name}_cnn.pth')

# Seed the NumPy and PyTorch global random generators so repeated runs of the
# notebook draw the same random numbers.
np.random.seed(0)
torch.manual_seed(0)
# The training calls below are left commented out; uncomment them to retrain.
# NOTE(review): the first call would save 'all_faces_v2_cnn.pth', while
# detect_all_faces loads 'all_faces_cnn.pth' - confirm which file is intended.
# train_classifier("all_faces_v2", ['dad', 'mom', 'dexter', 'deedee', 'unknown'], ['negative'])
# train_classifier("dad", ['dad'], ['mom', 'dexter', 'deedee', 'unknown', 'negative'])
# train_classifier("deedee", ['deedee'], ['dad', 'mom', 'dexter', 'unknown', 'negative'])
# train_classifier("dexter", ['dexter'], ['dad', 'mom', 'deedee', 'unknown', 'negative'])
# train_classifier("mom", ['mom'], ['dad', 'dexter', 'deedee', 'unknown', 'negative'])


<torch._C.Generator at 0x2417f63d3b0>

### Solver Functions

In [10]:
def is_completely_overlapped(box1, box2):
    """Return True when one box lies entirely inside the other.

    Boxes are ``(x1, y1, x2, y2)``; shared edges count as contained, so two
    identical boxes overlap completely.
    """
    a_left, a_top, a_right, a_bottom = box1
    b_left, b_top, b_right, b_bottom = box2

    first_contains_second = (
        a_left <= b_left and a_top <= b_top
        and a_right >= b_right and a_bottom >= b_bottom
    )
    if first_contains_second:
        return True

    return (
        b_left <= a_left and b_top <= a_top
        and b_right >= a_right and b_bottom >= a_bottom
    )

def sliding_window(image, step_size, window_size):
    """Yield ``(x, y, window)`` for every window position inside ``image``.

    ``window_size`` is ``(width, height)``.  Positions advance by
    ``step_size`` row by row, and only windows that fit completely inside the
    image are produced.
    """
    window_height = window_size[1]
    for top in range(0, image.shape[0] - window_height + 1, step_size):
        bottom = top + window_height
        last_left = image.shape[1] - window_size[0] + 1
        yield from (
            (left, top, image[top:bottom, left:left + window_size[0]])
            for left in range(0, last_left, step_size)
        )

def filter_boxes(best_results, iou_threshold=0.25):
    """Drop detections that conflict with a strictly higher-scoring one.

    ``best_results`` holds ``(bbox, score)`` pairs.  Two boxes conflict when
    their IoU exceeds ``iou_threshold`` or when one fully contains the other.
    Conflicting boxes with equal scores are both kept.  The surviving pairs
    are returned in their original order.
    """
    boxes = [entry[0] for entry in best_results]
    confidences = [entry[1] for entry in best_results]

    survivors = []
    for i, entry in enumerate(best_results):
        suppressed = any(
            i != j
            and (
                intersection_over_union(boxes[i], other) > iou_threshold
                or is_completely_overlapped(boxes[i], other)
            )
            and confidences[i] < confidences[j]
            for j, other in enumerate(boxes)
        )
        if not suppressed:
            survivors.append(entry)
    return survivors

def detect_faces_best_aspect_ratios(test_data, model, model_name, cluster_train_data, task="task1", step_size=16, threshold=0.90):
    """Scan every image in a folder with sliding windows and save the detections.

    Window sizes come from the (aspect ratio, height) centres of the pickled
    per-character KMeans models; up to 20 randomly chosen centres are tried
    per image.  Windows scoring above ``threshold`` go through NMS and
    ``filter_boxes``.  The results are written to
    ``npy_response + '<task>/'`` as ``detections_<model_name>.npy``,
    ``file_names_<model_name>.npy`` and ``scores_<model_name>.npy``.

    Args:
        test_data: Folder prefix (with trailing separator) of the images.
        model: Network called on a batch of 3x64x64 float windows; it must
            return one score per window.
        model_name: Suffix used in the output file names.
        cluster_train_data: Folder prefix of the ``<name>_kmeans.pkl`` files.
        task: Sub-folder of ``npy_response`` that receives the output.
        step_size: Stride of the sliding window.
        threshold: Minimum score for a window to be kept.

    Returns:
        None.  Results are only written to disk.
    """
    output_dir = npy_response + f'{task}/'
    # Create the output folder before the scan so that a long run cannot fail
    # at the very end because the folder is missing.
    os.makedirs(output_dir, exist_ok=True)

    # Collect the cluster centres of every per-character KMeans model.
    all_cluster_centers = []
    for name in ['dad', 'mom', 'dexter', 'deedee', 'unknown']:
        with open(cluster_train_data + name + '_kmeans.pkl', 'rb') as file:
            all_cluster_centers.extend(pickle.load(file).cluster_centers_)

    all_cluster_centers = np.vstack(all_cluster_centers)

    min_len = min(20, all_cluster_centers.shape[0])

    detections_for_model = []
    file_names_for_model = []
    scores_for_model = []

    for image_path in os.listdir(test_data):
        image = cv2.imread(test_data + image_path)
        best_results = []

        # A fresh random subset of window shapes is drawn for every image.
        random_indices = np.random.choice(all_cluster_centers.shape[0], min_len, replace=False)

        for aspect_ratio, height in all_cluster_centers[random_indices]:
            window_height = int(height)
            window_width = int(aspect_ratio * height)
            window_size = (window_width, window_height)

            batch_windows = []
            batch_coords = []

            # sliding_window only yields windows that fit inside the image,
            # so no extra bounds check is needed here.
            for x, y, window in sliding_window(image, step_size, window_size):
                resized_window = cv2.resize(window, (64, 64))
                resized_window = torch.tensor(resized_window, dtype=torch.float32) / 255.0
                batch_windows.append(resized_window.permute(2, 0, 1))
                batch_coords.append((x, y, x + window_width, y + window_height))

            if not batch_windows:
                continue

            with torch.no_grad():
                # Reshape to one score per window. A bare squeeze() turns a
                # single-window batch into a 0-d tensor whose tolist() is a
                # plain float, which cannot be indexed.
                probs = model(torch.stack(batch_windows)).reshape(len(batch_coords)).tolist()

            best_results.extend(
                (coords, prob)
                for coords, prob in zip(batch_coords, probs)
                if prob > threshold
            )

        if best_results:
            # Apply NMS
            bboxes, scores = zip(*best_results)
            bboxes = torch.tensor(bboxes, dtype=torch.float32)
            scores = torch.tensor(scores, dtype=torch.float32)
            indices = nms(bboxes, scores, 0.25)
            best_results = [(bboxes[idx].int().tolist(), scores[idx].item()) for idx in indices]

            # Filter results
            best_results = filter_boxes(best_results, 0.25)

            # Record the surviving detections for this image.
            for bbox, score in best_results:
                detections_for_model.append(bbox)
                file_names_for_model.append(image_path)
                scores_for_model.append(score)

        print("Processed", image_path)

    np.save(output_dir + f'detections_{model_name}.npy', detections_for_model)
    np.save(output_dir + f'file_names_{model_name}.npy', file_names_for_model)
    np.save(output_dir + f'scores_{model_name}.npy', scores_for_model)

### Task 1

In [11]:
def detect_all_faces():
    """Run the generic face detector over the test images (task 1).

    Loads the weights from ``all_faces_cnn.pth`` and stores the detections
    under the ``task1`` output folder with the ``all_faces`` suffix.
    """
    print("Detecting all faces...")

    detector = CNN()
    weights = torch.load("all_faces_cnn.pth")
    detector.load_state_dict(weights)
    detector.eval()  # inference mode: fixed batch-norm statistics, no dropout

    detect_faces_best_aspect_ratios(
        test_data,
        detector,
        'all_faces',
        cluster_train_data,
        task="task1",
        threshold=0.90,
    )
   
# Run task 1; the results are saved under npy_response + 'task1/'.
detect_all_faces()

Detecting all faces...
Processed 001.jpg
Processed 002.jpg
Processed 003.jpg
Processed 004.jpg
Processed 005.jpg
Processed 006.jpg
Processed 007.jpg
Processed 008.jpg
Processed 009.jpg
Processed 010.jpg
Processed 011.jpg
Processed 012.jpg
Processed 013.jpg
Processed 014.jpg
Processed 015.jpg
Processed 016.jpg
Processed 017.jpg
Processed 018.jpg
Processed 019.jpg
Processed 020.jpg
Processed 021.jpg
Processed 022.jpg
Processed 023.jpg
Processed 024.jpg
Processed 025.jpg
Processed 026.jpg
Processed 027.jpg
Processed 028.jpg
Processed 029.jpg
Processed 030.jpg
Processed 031.jpg
Processed 032.jpg
Processed 033.jpg
Processed 034.jpg
Processed 035.jpg
Processed 036.jpg
Processed 037.jpg
Processed 038.jpg
Processed 039.jpg
Processed 040.jpg
Processed 041.jpg
Processed 042.jpg
Processed 043.jpg
Processed 044.jpg
Processed 045.jpg
Processed 046.jpg
Processed 047.jpg
Processed 048.jpg
Processed 049.jpg
Processed 050.jpg
Processed 051.jpg
Processed 052.jpg
Processed 053.jpg
Processed 054.jpg
Proce

### Task 2

In [12]:
def _detect_character(character):
    """Load ``<character>_cnn.pth`` and run the task 2 detection with it.

    The detections are saved by ``detect_faces_best_aspect_ratios`` under the
    ``task2`` output folder, using ``character`` as the file-name suffix.
    """
    print(f"Scanning set for {character}")
    model = CNN()
    model.load_state_dict(torch.load(f"{character}_cnn.pth"))
    model.eval()
    detect_faces_best_aspect_ratios(test_data, model, character, cluster_train_data, task="task2")


def detect_dad():
    """Scan the test images with the 'dad' classifier."""
    _detect_character('dad')


def detect_deedee():
    """Scan the test images with the 'deedee' classifier."""
    _detect_character('deedee')


def detect_dexter():
    """Scan the test images with the 'dexter' classifier."""
    _detect_character('dexter')


def detect_mom():
    """Scan the test images with the 'mom' classifier."""
    _detect_character('mom')

# Run task 2: scan the test set once with each per-character classifier.
detect_dad()
detect_deedee()
detect_dexter()
detect_mom()
print("Finished scanning images with all models")

Scanning set for dad
Processed 001.jpg
Processed 002.jpg
Processed 003.jpg
Processed 004.jpg
Processed 005.jpg
Processed 006.jpg
Processed 007.jpg
Processed 008.jpg
Processed 009.jpg
Processed 010.jpg
Processed 011.jpg
Processed 012.jpg
Processed 013.jpg
Processed 014.jpg
Processed 015.jpg
Processed 016.jpg
Processed 017.jpg
Processed 018.jpg
Processed 019.jpg
Processed 020.jpg
Processed 021.jpg
Processed 022.jpg
Processed 023.jpg
Processed 024.jpg
Processed 025.jpg
Processed 026.jpg
Processed 027.jpg
Processed 028.jpg
Processed 029.jpg
Processed 030.jpg
Processed 031.jpg
Processed 032.jpg
Processed 033.jpg
Processed 034.jpg
Processed 035.jpg
Processed 036.jpg
Processed 037.jpg
Processed 038.jpg
Processed 039.jpg
Processed 040.jpg
Processed 041.jpg
Processed 042.jpg
Processed 043.jpg
Processed 044.jpg
Processed 045.jpg
Processed 046.jpg
Processed 047.jpg
Processed 048.jpg
Processed 049.jpg
Processed 050.jpg
Processed 051.jpg
Processed 052.jpg
Processed 053.jpg
Processed 054.jpg
Process