In [6]:
from torch.utils.data import Dataset, DataLoader
from PIL import Image
from facenet_pytorch import MTCNN, InceptionResnetV1
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.optim import Adam
import torchvision.transforms as transforms

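Note that InceptionResnetV1 is arguably one of the most basic backbones available for FaceNet-style models. For reference, this is how ChatGPT summarizes the alternatives (not independently verified):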

Beyond InceptionResNetV1, there are several variants of FaceNet models, each designed with different architectures and optimizations. FaceNet itself is a framework for face recognition, and it can be built on various network architectures. Some of the most well-known FaceNet variants include:

Inception-ResNet-V2: A more advanced version of InceptionResNetV1, this model improves performance through a better balance of Inception networks and residual connections.
ResNet-based FaceNet: Some variations of FaceNet are built on ResNet architectures, such as ResNet-50, ResNet-101, or ResNet-152. These models are designed to handle more complex datasets and have better feature extraction capabilities.
MobileNet-based FaceNet: Designed for efficiency, MobileNet architectures are lightweight models that are optimized for mobile and edge devices. MobileNet-based FaceNet models are designed for real-time applications with limited computational resources.
EfficientNet-based FaceNet: EfficientNet architectures provide high accuracy with fewer parameters. They are designed to be more efficient in terms of computation and memory, which can be important for deployment on resource-constrained devices.
VGG-based FaceNet: VGG networks have also been adapted for face recognition tasks. They are simpler but highly effective in many applications, especially when fine-tuned for specific tasks.
Siamese Networks: In some cases, FaceNet models are implemented as Siamese Networks (pairwise models) to enhance performance in one-shot or few-shot learning scenarios, where the model needs to recognize faces from only one or a handful of examples.

In [10]:
class FacePairDataset(Dataset):
    """Dataset of face-image pairs for pairwise (Siamese-style) training.

    For each index, the two image files are opened, converted to RGB and run
    through MTCNN face detection; the first face MTCNN returns for each image
    is kept.

    Args:
        image1_paths: Sequence of paths to the first image of every pair.
        image2_paths: Sequence of paths to the second image of every pair.
        labels: Sequence with one numeric label per pair.
        transform: Optional callable applied to each cropped face. It receives
            whatever MTCNN returned for that face (a tensor, not a PIL image).

    Raises:
        ValueError: If the three sequences do not all have the same length.
    """

    def __init__(self, image1_paths, image2_paths, labels, transform=None):
        # Validate before doing anything expensive: __len__ is driven by
        # `labels`, so a mismatch would otherwise only surface later as an
        # IndexError in the middle of a DataLoader batch.
        if not (len(image1_paths) == len(image2_paths) == len(labels)):
            raise ValueError(
                'image1_paths, image2_paths and labels must have the same length '
                f'(got {len(image1_paths)}, {len(image2_paths)} and {len(labels)}).'
            )

        self.image1_paths = image1_paths
        self.image2_paths = image2_paths
        self.labels = labels
        self.transform = transform
        # keep_all=True makes MTCNN return every detected face; only the first
        # one is used in __getitem__.
        self.mtcnn = MTCNN(image_size=160, keep_all=True)

    def __len__(self):
        """Return the number of pairs (one label per pair)."""
        return len(self.labels)

    def _detect_first_face(self, path):
        """Open `path`, run MTCNN on it and return the first face (or None)."""
        # The context manager releases the file handle as soon as the pixel
        # data has been copied into the RGB image.
        with Image.open(path) as raw_image:
            image = raw_image.convert('RGB')

        # Detect faces and align them (using MTCNN)
        faces, _ = self.mtcnn(image, return_prob=True)
        return None if faces is None else faces[0]

    def __getitem__(self, idx):
        """Return ``((face1, face2), label)`` for the pair at `idx`.

        Raises:
            ValueError: If MTCNN finds no face in either image of the pair.
        """
        face1 = self._detect_first_face(self.image1_paths[idx])
        face2 = self._detect_first_face(self.image2_paths[idx])
        label = self.labels[idx]

        if face1 is None or face2 is None:
            raise ValueError('No faces detected in one or both images.')

        if self.transform:
            face1 = self.transform(face1)
            face2 = self.transform(face2)

        return (face1, face2), torch.tensor(label, dtype=torch.float32)
    

In [8]:
class ContrastiveLoss(nn.Module):
    """Contrastive loss on the cosine distance between two embedding batches.

    With ``d = 1 - cosine_similarity(output1, output2)`` the per-pair loss is

    * ``label == 1`` (matching pair):      ``d ** 2``
    * ``label == 0`` (non-matching pair):  ``max(0, margin - d) ** 2``

    and the result is the mean over the batch. With the default
    ``margin=1.0`` the non-matching term equals ``max(0, similarity) ** 2``:
    different faces are penalised only while their similarity is positive.

    Args:
        margin: Cosine distance beyond which a non-matching pair contributes
            no loss.
    """

    def __init__(self, margin=1.0):
        super().__init__()
        self.margin = margin

    def forward(self, output1, output2, label):
        """Compute the mean contrastive loss.

        Args:
            output1: First batch of embeddings.
            output2: Second batch of embeddings, same shape as `output1`.
            label: Per-pair labels (1 = same identity, 0 = different).

        Returns:
            Scalar tensor holding the batch-mean loss.
        """
        # Compute the cosine similarity between the two embeddings
        similarity = F.cosine_similarity(output1, output2)
        distance = 1 - similarity

        label = torch.as_tensor(label, dtype=similarity.dtype, device=similarity.device)
        # A (B, 1) label would silently broadcast against the (B,) similarity
        # into a (B, B) matrix; align the shapes whenever the sizes agree.
        if label.numel() == similarity.numel():
            label = label.reshape(similarity.shape)

        # Matching pairs: pull the embeddings together.
        positive_term = label * torch.pow(torch.clamp(distance, min=0.0), 2)
        # Non-matching pairs: push apart, but only until `margin` is reached,
        # so already well-separated pairs are not penalised.
        negative_term = (1 - label) * torch.pow(
            torch.clamp(self.margin - distance, min=0.0), 2
        )
        return torch.mean(positive_term + negative_term)
    

In [9]:
class FineTunedFaceNet(nn.Module):
    """Backbone plus a trainable linear head producing 128-d embeddings.

    Args:
        base_model: Feature extractor exposing a ``last_linear`` layer. Its
            forward output is assumed to have ``last_linear.out_features``
            columns (the case for facenet_pytorch's ``InceptionResnetV1`` in
            embedding mode, i.e. ``classify=False``).
    """

    def __init__(self, base_model):
        super().__init__()
        self.base_model = base_model
        # The head consumes the *output* of the base model, so it must be
        # sized from last_linear.out_features. Using in_features (the width
        # entering last_linear) causes a shape mismatch on the first forward
        # pass.
        self.fc = nn.Linear(base_model.last_linear.out_features, 128)    # 128-dimensional embedding vector

    def forward(self, x):
        """Return the 128-dimensional embedding for input batch `x`."""
        # Forward pass through the base model
        x = self.base_model(x)
        # Pass through a fully connected layer to get the final embedding
        x = self.fc(x)
        return x

In [None]:
# First load in a dataset
# Create combinations
#
# TODO: fill these three parallel lists. Entry i of each list describes pair i:
# two image paths and a label (1 = same person, 0 = different people).
image1_paths = []
image2_paths = []
labels = [1, 0]

batch_size = 2

# Fail early with a readable message instead of an IndexError raised from
# inside the DataLoader once training has started.
assert len(image1_paths) == len(image2_paths) == len(labels), (
    'image1_paths, image2_paths and labels must have the same length'
)
assert len(labels) >= batch_size, 'need at least one full batch of image pairs'

# Define the transformations
# FacePairDataset hands `transform` the face tensor produced by MTCNN, which
# is already cropped to 160x160 and standardised by MTCNN's post-processing.
# A PIL pipeline (Resize -> ToTensor -> ImageNet Normalize) is therefore both
# invalid (ToTensor rejects tensors) and a mismatch for the vggface2
# weights, so no extra transform is applied.
transform = None

# Create the dataset and dataloaders
dataset = FacePairDataset(image1_paths, image2_paths, labels, transform)
# drop_last avoids a trailing batch of size 1, which batch-norm layers cannot
# process in training mode.
dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=True, drop_last=True)

# Initialize the model and optimizer
# (no .eval() here: model.train() below sets the mode at every epoch anyway)
base_model = InceptionResnetV1(pretrained='vggface2')
model = FineTunedFaceNet(base_model)  # instantiate; the bare class has no parameters to optimise

optimizer = Adam(model.parameters(), lr=0.0001)
criterion = ContrastiveLoss()  # built once instead of once per step

# Training loop
num_epochs = 10
for epoch in range(num_epochs):
    model.train()
    running_loss = 0.0
    for (face1, face2), label in dataloader:
        optimizer.zero_grad()

        # Forward pass: get embeddings for the two face images
        embedding1 = model(face1)
        embedding2 = model(face2)

        # Compute the contrastive loss
        loss = criterion(embedding1, embedding2, label)

        # Backward pass and optimization
        loss.backward()
        optimizer.step()

        running_loss += loss.item()

    print(f'Epoch {epoch + 1}/{num_epochs}, Loss: {running_loss / len(dataloader)}')

In [None]:
# Persist the fine-tuned weights: only the state dict is written, not the
# module object itself.
torch.save(obj=model.state_dict(), f='finetuned_facenet.pth')