# Face recognition siamese model

## Imports

In [None]:
import os

import torch
from torch import nn
from torch.utils.data import DataLoader
from torch.utils.data import Dataset
from torch.utils.data import Subset
from torch.utils.data import random_split

from torchvision import datasets
from torchvision.transforms import ToTensor, Lambda, Compose, RandomHorizontalFlip, Resize

from torchdata.datapipes.iter import Zipper, IterableWrapper

import matplotlib.pyplot as plt
import pandas as pd
import numpy as np

## Data pre-processing

In [None]:
# Dataset layout: one sub-folder per image role underneath the data root.
ROOT_PATH = 'data'
POS_PATH, NEG_PATH, ANC_PATH = (
    os.path.join(ROOT_PATH, role) for role in ('positive', 'negative', 'anchor')
)

In [None]:
# Per-image pipeline: random horizontal flip, conversion to a tensor, then a
# resize to 105x105 (the input size the embedding network documents).
transform_steps = [RandomHorizontalFlip(), ToTensor(), Resize((105, 105))]
img_transforms = Compose(transform_steps)

# Load every image found under ROOT_PATH and apply the pipeline on access.
full_dataset: datasets.ImageFolder = datasets.ImageFolder(
    root=ROOT_PATH,
    transform=img_transforms,
)

dataset = [img_paths, labels]

Labels (`ImageFolder` assigns class indices by sorted folder name):
- anchor = 0
- negative = 1
- positive = 2

In [None]:
# Build the label tensor once instead of once per mask.
targets: torch.Tensor = torch.tensor(full_dataset.targets)

# Look the class indices up by folder name rather than hard-coding 0/1/2, so
# the masks stay correct even if the folder set (and thus the numbering) changes.
class_to_idx = full_dataset.class_to_idx

# boolean masks marking where the anchor, negative, and positive images are in the dataset
is_anchor: torch.Tensor = targets == class_to_idx['anchor']
is_negative: torch.Tensor = targets == class_to_idx['negative']
is_positive: torch.Tensor = targets == class_to_idx['positive']

# extract the anchor, negative, and positive img indices
anchor_indices: torch.Tensor = is_anchor.nonzero().flatten()
negative_indices: torch.Tensor = is_negative.nonzero().flatten()
positive_indices: torch.Tensor = is_positive.nonzero().flatten()

# create the anchor, negative, and positive datasets
anchor_dataset: Subset = Subset(full_dataset, anchor_indices)
negative_dataset: Subset = Subset(full_dataset, negative_indices)
positive_dataset: Subset = Subset(full_dataset, positive_indices)

Now, the datasets are [(img, label), (img, label), (img, label), ...]. We need them to be [img, img, img, ...] only, no label needed since they are already split by label.

In [None]:
# WARNING: this takes about 1.5 minutes to run, please only run it once
# Each subset item is an (img, label) pair; keep only the image. Iterating the
# subset directly avoids building a throwaway list of pairs first, and the
# tuple unpacking makes an accidental second run fail loudly instead of
# silently slicing into the already-extracted image tensors.
anchor_dataset: list = [img for img, _label in anchor_dataset]
negative_dataset: list = [img for img, _label in negative_dataset]
positive_dataset: list = [img for img, _label in positive_dataset]

In [None]:
# Pair every anchor with a positive image (label 1) and with a negative image
# (label 0). The builtin zip yields the same (anchor, other, label) tuples as
# the torchdata Zipper/IterableWrapper datapipes did and likewise stops at the
# shortest input, without relying on the deprecated datapipes API.
# NOTE: unlike IterableWrapper (which deep-copies its input by default), the
# anchor tensors are shared between the positive and the negative pairs.
pos_labels: torch.Tensor = torch.ones(len(anchor_dataset))
neg_labels: torch.Tensor = torch.zeros(len(anchor_dataset))

zipped_pos_dataset: list = list(zip(anchor_dataset, positive_dataset, pos_labels))
zipped_neg_dataset: list = list(zip(anchor_dataset, negative_dataset, neg_labels))

Now the `zipped_pos_dataset` has the format: `[(anchor, positive, 1), (anchor, positive, 1), ...]`. Here the 1 is the label of the pair, which signifies that the image is positive, meaning from the same person as anchor. So the `zipped_neg_dataset` has 0s instead of 1s and negative images instead of positive.

In [None]:
# Merge the positive and negative pairs into one list, then shuffle it in place
final_dataset : list = [*zipped_pos_dataset, *zipped_neg_dataset]
np.random.shuffle(final_dataset)

## Data loading

In [None]:
batch_size = 64

# 80-20 train/test split: the test set receives whatever remains after the
# (truncated) 80% training share, so the two sizes always add up to the total.
num_samples = len(final_dataset)
num_train = int(num_samples * 0.8)
train_set, test_set = random_split(final_dataset, [num_train, num_samples - num_train])

# NOTE: despite its name, `test_dataset` is a DataLoader over the test split.
train_dataloader : DataLoader = DataLoader(train_set, batch_size=batch_size)
test_dataset : DataLoader = DataLoader(test_set, batch_size=batch_size)

len(dataloader) = number of batches = number of samples in that split (train or test) / batch_size, rounded up

Data is organized inside `dataloader` as follows: [anchors, pos/neg imgs, label]
- each img has shape [3, 105, 105] (set by `Resize((105, 105))`); since there are _batch_size_ (currently 64) images in a batch, the anchor and pos/neg image tensors each have shape [64, 3, 105, 105] (the last batch may be smaller)
- label is either 0 or 1: 0 for negative, 1 for positive

Note: run the next block to confirm

In [None]:
# Fetch one batch through the public iterator protocol instead of the
# DataLoader's private `_get_iterator()` helper.
first_batch = next(iter(train_dataloader))

# N = batch size, C = color channels, H = height, W = width
print("Shape of data [N, C, H, W]: ", first_batch[0].shape)
# Labels are the third element of the batch; report their own shape and dtype
# (previously the dtype printed here was that of the images at index 1).
print("Shape of labels: ", first_batch[2].shape, first_batch[2].dtype)

### Visualizing the data for debugging

Here is an example of how to visualize an image

In [None]:
# The first anchor image of the batch lives at first_batch[0][0].
first_anchor = first_batch[0][0]

# plt.imshow expects the channel axis last, so reorder the tensor axes
# with permute before plotting.
plt.imshow(first_anchor.permute(1, 2, 0))

# How to access different batches:
# it = iter(train_dataloader)
# first_batch = next(it)
# second_batch = next(it)

## Model

### Embedding layer

In [33]:
# Pick the training device: the GPU when CUDA is available, the CPU otherwise.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
print("Using {} device".format(device))

Using cpu device


In [34]:
class EmbeddingNetwork(nn.Module):
    """Convolutional encoder mapping a 3x105x105 image to a 4096-d embedding.

    Four convolution stages (the first three followed by 2x2 max-pooling)
    are flattened and projected by a dense layer with a sigmoid activation.
    """

    # Spatial size per stage for a 105x105 input (every conv uses padding=1):
    #   conv 10x10: 105 -> 98, pool -> 49
    #   conv  7x7 :  49 -> 45, pool -> 22
    #   conv  4x4 :  22 -> 21, pool -> 10
    #   conv  4x4 :  10 ->  9
    # so the flattened feature vector has 256 * 9 * 9 = 20736 entries.
    _FLAT_FEATURES = 256 * 9 * 9

    def __init__(self):
        super().__init__()

        # layers
        # first layer: 3 input channels, 64 output channels
        self.l1 = nn.Conv2d(3, 64, 10, padding=1)
        self.a1 = nn.ReLU()
        self.p1 = nn.MaxPool2d(2)

        # second layer: 64 input channels, 128 output channels
        self.l2 = nn.Conv2d(64, 128, 7, padding=1)
        self.a2 = nn.ReLU()
        self.p2 = nn.MaxPool2d(2)

        # third layer: 128 input channels, 128 output channels
        self.l3 = nn.Conv2d(128, 128, 4, padding=1)
        self.a3 = nn.ReLU()
        self.p3 = nn.MaxPool2d(2)

        # fourth layer: 128 input channels, 256 output channels, then flatten
        self.l4 = nn.Conv2d(128, 256, 4, padding=1)
        self.a4 = nn.ReLU()
        self.p4 = nn.Flatten()

        # dense layer: flattened conv features in, 4096-d embedding out
        # (in_features must equal the flattened size, not the channel count)
        self.l5 = nn.Linear(self._FLAT_FEATURES, 4096)
        self.a5 = nn.Sigmoid()

    def forward(self, x):
        """Pass the input tensor through the embedding network.

        Args:
            x: input batch of shape [N, 3, 105, 105]

        Returns:
            torch.Tensor: embeddings of shape [N, 4096], each value in [0, 1]
        """
        # convolutional feature extractor
        x = self.p1(self.a1(self.l1(x)))
        x = self.p2(self.a2(self.l2(x)))
        x = self.p3(self.a3(self.l3(x)))
        x = self.p4(self.a4(self.l4(x)))

        # dense projection to the embedding; this step was previously skipped,
        # so the network returned the raw 20736-d flattened features.
        x = self.a5(self.l5(x))
        return x

### Siamese network

In [35]:
class SiameseNetwork(nn.Module):
    """Siamese classifier: one shared embedding network plus a linear head."""

    def __init__(self):
        super().__init__()

        # shared encoder applied to both images
        self.embedding_layer = EmbeddingNetwork()

        # linear head turning a 4096-d distance vector into a single score
        self.classification_layer = nn.Linear(4096, 1)

    def forward(self, x_input, x_target):
        """Score a pair of images with the siamese network.

        Args:
            x_input (torch.Tensor): input image (from webcam), 3 channels, 105x105 pixels
            x_target (torch.Tensor): target image (from database), 3 channels, 105x105 pixels

        Returns:
            torch.Tensor: output tensor, 1 channel (the raw output of the
                linear head; no activation is applied here)
        """
        # embed both images with the same weights
        emb_input = self.embedding_layer(x_input)
        emb_target = self.embedding_layer(x_target)

        # element-wise absolute difference between the two embeddings
        l1_distance = (emb_input - emb_target).abs()

        # classify the pair from the distance vector
        return self.classification_layer(l1_distance)

### Loss and optimizer

In [None]:
# Build the siamese model, then move its parameters to the selected device.
model = SiameseNetwork()
model = model.to(device)
# print(model)

In [None]:
# Binary cross entropy on logits + Adam optimizer.
# The model emits a single raw logit per pair (no sigmoid in its forward pass)
# and the pair labels are 0/1, so this is a binary problem. nn.CrossEntropyLoss
# expects one logit per class; with a single output unit it cannot learn.
# NOTE(review): BCEWithLogitsLoss needs float targets shaped like the model
# output (e.g. `labels.unsqueeze(1)`) - confirm in the training loop.
loss_fn = nn.BCEWithLogitsLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

## Training & testing