# PACKAGES LOADING

In [1]:
%load_ext autoreload

%autoreload 2

In [None]:
import bz2
import os
import numpy as np
import cv2
import matplotlib.pyplot as plt
import matplotlib.patches as patches
%matplotlib inline

from align import AlignDlib
import warnings
# Suppress LabelEncoder warning
warnings.filterwarnings('ignore')

from sklearn.preprocessing import LabelEncoder
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import LinearSVC

from keras import backend as K
from keras.models import Model
from keras.layers import Input, Layer

# DATA LOADING

In [3]:
class IdentityMetadata:
    """Location of a single image of one identity inside a dataset directory."""

    def __init__(self, base, name, file):
        # base: dataset base directory
        # name: identity name
        # file: image file name
        self.base, self.name, self.file = base, name, file

    def image_path(self):
        """Return base, name and file joined into one filesystem path."""
        parts = (self.base, self.name, self.file)
        return os.path.join(*parts)

    def __repr__(self):
        # An entry is represented by the path of its image.
        return self.image_path()
    
def load_metadata(path):
    """Collect an IdentityMetadata entry for every JPEG image below ``path``.

    Each sub-directory of ``path`` is treated as one identity (the directory
    name becomes the identity name). Entries of ``path`` that are not
    directories are skipped. Identities and files are visited in sorted order.

    Args:
        path: dataset base directory.

    Returns:
        numpy array of IdentityMetadata objects (empty if nothing matched).
    """
    # Compared against the lower-cased extension so '.JPG', '.JPEG', '.Jpg', ...
    # are all accepted, not only the three exact spellings.
    allowed_extensions = {'.jpg', '.jpeg'}

    metadata = []
    for identity in sorted(os.listdir(path)):
        identity_dir = os.path.join(path, identity)
        if not os.path.isdir(identity_dir):
            # Stray files next to the identity folders carry no images to index
            continue
        for file_name in sorted(os.listdir(identity_dir)):
            ext = os.path.splitext(file_name)[1].lower()
            if ext in allowed_extensions:
                metadata.append(IdentityMetadata(path, identity, file_name))
    return np.array(metadata)

# Index every image found below the local 'images' directory (path is relative to the working directory)
metadata = load_metadata('images')

# FACE DETECTION

We first try Haar cascade filters, as used in the Viola–Jones algorithm. We load an *XML* file containing the Haar feature descriptors for face detection.

In [5]:
# Haar cascade for frontal faces; the XML file is looked up relative to the working directory
face_cascade = cv2.CascadeClassifier('haarcascade_frontalface_default.xml')
if face_cascade.empty():
    # CascadeClassifier does not raise on a missing/invalid file, it just stays empty
    raise IOError("Could not load 'haarcascade_frontalface_default.xml'")

# Sample image (entry 77 of the metadata array), read in OpenCV's BGR channel order
img = cv2.imread(metadata[77].image_path())
if img is None:
    # cv2.imread returns None instead of raising when the file cannot be read
    raise FileNotFoundError(f'Could not read image: {metadata[77].image_path()}')

# The detector below is run on this grayscale version of the image
gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)

In [None]:
# Run the Haar cascade on the grayscale image (scaleFactor=1.3, minNeighbors=5)
faces = face_cascade.detectMultiScale(gray, 1.3, 5)

# Draw on a copy so that re-running this cell never accumulates rectangles on `img`
annotated = img.copy()
for (x, y, w, h) in faces:
    cv2.rectangle(annotated, (x, y), (x + w, y + h), (255, 0, 0), 2)

# Show the result inline: cv2.imshow/cv2.waitKey(0) opens a native window and blocks
# the kernel until a key is pressed, which prevents an unattended "Run All".
# The image is in BGR order, so convert it to RGB for matplotlib.
plt.imshow(cv2.cvtColor(annotated, cv2.COLOR_BGR2RGB))
plt.title(f'{len(faces)} face(s) detected')
plt.axis('off');

Notice that this method takes a long time to detect a face on a single image. We will therefore replace it with a related approach: facial landmark detection followed by **face alignment**. The alignment uses the positions of the outer eyes and the nose as reference landmarks.

In [None]:
def load_image(path):
    """Read the image at ``path`` and return it with channels in RGB order.

    Args:
        path: location of the image file.

    Returns:
        The image as loaded by OpenCV with its last axis reversed (BGR -> RGB).

    Raises:
        FileNotFoundError: if OpenCV cannot read an image from ``path``.
    """
    img = cv2.imread(path, cv2.IMREAD_COLOR)
    if img is None:
        # cv2.imread reports failure by returning None rather than raising
        raise FileNotFoundError(f'Could not read image: {path}')
    # OpenCV loads images with color channels
    # in BGR order. So we need to reverse them
    return img[..., ::-1]

# Initialize the OpenFace face alignment utility
# (the landmark model file is looked up relative to the working directory)
alignment = AlignDlib('models/landmarks.dat')

# Load the sample image (entry 77 of the metadata array)
jc_orig = load_image(metadata[77].image_path())

# Detect face and return bounding box
# NOTE(review): presumably None when no face is found — confirm against AlignDlib
bb = alignment.getLargestFaceBoundingBox(jc_orig)

# Transform image using specified face landmark indices and crop it.
# NOTE(review): the first argument (136) is presumably the output size in pixels;
# an earlier version of this comment said 96x96, which does not match this call.
jc_aligned = alignment.align(136, jc_orig, bb, landmarkIndices=AlignDlib.OUTER_EYES_AND_NOSE)

# Show original image (left panel of a 1x3 grid)
plt.subplot(131)
plt.imshow(jc_orig)

# Show original image with bounding box (middle panel)
plt.subplot(132)
plt.imshow(jc_orig)
plt.gca().add_patch(patches.Rectangle((bb.left(), bb.top()), bb.width(), bb.height(), fill=False, color='red'))

# Show aligned image (right panel); the trailing ';' hides the repr of the return value
plt.subplot(133)
plt.imshow(jc_aligned);

As described in the OpenFace pre-trained models section, landmark indices `OUTER_EYES_AND_NOSE` are required for model nn4.small2.v1. Let's implement face detection, transformation and cropping as align_image function for later reuse.

In [None]:
def align_image(img):
    """Align the largest face detected in ``img`` using the outer-eyes-and-nose landmarks.

    The bounding box comes from ``alignment.getLargestFaceBoundingBox`` and is
    passed on to ``alignment.align`` together with the size argument 96.
    """
    largest_face = alignment.getLargestFaceBoundingBox(img)
    return alignment.align(
        96,
        img,
        largest_face,
        landmarkIndices=AlignDlib.OUTER_EYES_AND_NOSE,
    )

# FACE ENCODING

## Embedding vectors

In [None]:
# create_model is otherwise only imported in a later cell; import it here so this
# cell also works on a fresh kernel (Restart & Run All) instead of raising NameError.
from model import create_model

# Build the network and load the weights file shipped under weights/
nn4_small2_pretrained = create_model()
nn4_small2_pretrained.load_weights('weights/nn4.small2.v1.h5')

In [None]:
# One 128-dimensional embedding per image. A row stays all-zero when no aligned
# face could be produced for that image.
embedded = np.zeros((metadata.shape[0], 128))

for i, m in enumerate(metadata):
    print(m)
    img = load_image(m.image_path())
    img = align_image(img)
    if img is None:
        # Explicit check instead of catching TypeError: a blanket except would also
        # hide unrelated TypeErrors raised while scaling or predicting.
        print(f'No aligned face for {m}; embedding left as zeros')
        continue
    # scale RGB values to interval [0,1]
    img = (img / 255.).astype(np.float32)
    # obtain embedding vector for image (predict expects a batch, hence expand_dims)
    embedded[i] = nn4_small2_pretrained.predict(np.expand_dims(img, axis=0))[0]

Let's verify on a single triplet example that the squared L2 distance between its anchor-positive pair is smaller than the distance between its anchor-negative pair.

In [None]:
def distance(emb1, emb2):
    """Return the squared L2 distance between two embedding vectors."""
    delta = emb1 - emb2
    return np.square(delta).sum()

def show_pair(idx1, idx2):
    """Plot the images at metadata positions ``idx1`` and ``idx2`` side by side.

    The figure title shows the distance between their two embedding vectors.
    """
    plt.figure(figsize=(8,3))
    pair_distance = distance(embedded[idx1], embedded[idx2])
    plt.suptitle(f'Distance = {pair_distance:.2f}')
    # Left panel shows idx1, right panel shows idx2
    for panel, idx in ((121, idx1), (122, idx2)):
        plt.subplot(panel)
        plt.imshow(load_image(metadata[idx].image_path()))

# Image 77 against image 76, then image 77 against image 21
# (the text below discusses these as a same-identity and a different-identity pair)
show_pair(77, 76)
show_pair(77, 21)

As expected, the distance between the two images of Olivier Auliard is smaller than the distance between an image of Olivier Auliard and an image of Arthur Bossuet (0.30 < 1.12). But we still do not know what distance threshold $\tau$ is the best boundary for making a decision between *same identity* and *different identity*.

### Distance threshold

To find the optimal value for $\tau$, the face verification performance must be evaluated on a range of distance threshold values. At a given threshold, all possible embedding vector pairs are classified as either *same identity* or *different identity* and compared to the ground truth. Since we're dealing with skewed classes (many more negative pairs than positive pairs), we use the [F1 score](https://en.wikipedia.org/wiki/F1_score) as the evaluation metric instead of [accuracy](http://scikit-learn.org/stable/modules/generated/sklearn.metrics.accuracy_score.html).

In [None]:
from sklearn.metrics import f1_score, accuracy_score

distances = [] # squared L2 distance between pairs
identical = [] # 1 if same identity, 0 otherwise

num = len(metadata)

# Visit every unordered pair (i, j) with i < j exactly once. Starting the inner
# loop at 1 instead of i + 1 would add self-pairs (distance 0, always "identical")
# and count most pairs twice, which distorts both F1 and accuracy.
for i in range(num - 1):
    for j in range(i + 1, num):
        distances.append(distance(embedded[i], embedded[j]))
        identical.append(1 if metadata[i].name == metadata[j].name else 0)

distances = np.array(distances)
identical = np.array(identical)

# Candidate thresholds: 0.30, 0.31, ... up to (but excluding) 1.0
thresholds = np.arange(0.3, 1.0, 0.01)

# A pair is predicted "same identity" when its distance is below the threshold
f1_scores = [f1_score(identical, distances < t) for t in thresholds]
acc_scores = [accuracy_score(identical, distances < t) for t in thresholds]

opt_idx = np.argmax(f1_scores)
# Threshold at maximal F1 score
opt_tau = thresholds[opt_idx]
# Accuracy at maximal F1 score (already computed above for this threshold)
opt_acc = acc_scores[opt_idx]

# Plot F1 score and accuracy as function of distance threshold
plt.plot(thresholds, f1_scores, label='F1 score');
plt.plot(thresholds, acc_scores, label='Accuracy');
plt.axvline(x=opt_tau, linestyle='--', lw=1, c='lightgrey', label='Threshold')
plt.title(f'Accuracy at threshold {opt_tau:.2f} = {opt_acc:.3f}');
plt.xlabel('Distance threshold')
plt.legend();

# FACE IDENTIFICATION

In [None]:
# Identity name of every image, in metadata order
targets = np.array([m.name for m in metadata])

# Numerical encoding of identities
encoder = LabelEncoder()
y = encoder.fit_transform(targets)

# Split by position: even positions form the test set, odd positions the training set
positions = np.arange(metadata.shape[0])
test_idx = positions % 2 == 0
train_idx = ~test_idx

X_train, y_train = embedded[train_idx], y[train_idx]
X_test, y_test = embedded[test_idx], y[test_idx]

# Fit a 1-nearest-neighbour classifier and a linear SVM on the same embeddings
knn = KNeighborsClassifier(n_neighbors=1, metric='euclidean').fit(X_train, y_train)
svc = LinearSVC().fit(X_train, y_train)

# accuracy_score is imported by the threshold cell above
acc_knn = accuracy_score(y_test, knn.predict(X_test))
acc_svc = accuracy_score(y_test, svc.predict(X_test))

print(f'KNN accuracy = {acc_knn}, SVM accuracy = {acc_svc}')

In [None]:
# Only needed for the dataset alternative mentioned below
example_idx = 42

# NOTE(review): absolute, machine-specific path — replace with a project-relative file.
# Dataset alternative: metadata[test_idx][example_idx].image_path()
example_image = load_image("/Users/kwassibenocharlesdokodjo/Downloads/Sofiane_New.jpg")

# Compute the embedding of the example image right here so this cell does not
# depend on variables that are only created by cells further down the notebook.
example_aligned = align_image(example_image)
if example_aligned is None:
    raise ValueError('No face could be aligned in the example image')
# scale RGB values to interval [0,1]
example_aligned = (example_aligned / 255.).astype(np.float32)
example_embedding = nn4_small2_pretrained.predict(np.expand_dims(example_aligned, axis=0))[0]

# Predicted class index, mapped back to the identity name
example_prediction = svc.predict([example_embedding])
example_identity = encoder.inverse_transform(example_prediction)[0]

plt.imshow(example_image)

# Per-class decision scores for the single example (one row)
p = svc.decision_function(np.array([example_embedding]))

# Softmax over the scores; np.concatenate flattens the single row into a 1-D vector
probs = np.concatenate(np.exp(p)/np.sum(np.exp(p),axis=1))

# Below this softmax score the face is reported as unknown
if probs.max()>0.3:
    plt.title(f'Recognized as {example_identity}');
else:
    plt.title(f'Recognized as Unknown Person')

# FEW SHOT PARADIGM

## CNN Architecture and training

In [None]:
# create_model comes from the project-local `model` module
from model import create_model

# Separate model instance from nn4_small2_pretrained; no weights are loaded in this cell
nn4_small2 = create_model()

In [None]:
# Three image inputs of identical shape, created in the order anchor, positive, negative
in_a, in_p, in_n = (Input(shape=(96, 96, 3)) for _ in range(3))

# Embedding vectors for the three inputs.
# The same nn4_small2 model instance is applied to each of them (Siamese network)
emb_a, emb_p, emb_n = (nn4_small2(tensor) for tensor in (in_a, in_p, in_n))

class TripletLossLayer(Layer):
    """Layer that computes the triplet loss and registers it via ``add_loss``.

    Args:
        alpha: margin added to (positive distance - negative distance) before
            clamping at zero.
        **kwargs: forwarded to the base ``Layer`` constructor (e.g. ``name``).
    """

    def __init__(self, alpha, **kwargs):
        self.alpha = alpha
        super().__init__(**kwargs)

    def triplet_loss(self, inputs):
        """Return sum over axis 0 of max(||a-p||^2 - ||a-n||^2 + alpha, 0).

        ``inputs`` is unpacked as (anchor, positive, negative) embeddings.
        """
        a, p, n = inputs
        # Squared L2 distances, reduced over the last axis
        p_dist = K.sum(K.square(a-p), axis=-1)
        n_dist = K.sum(K.square(a-n), axis=-1)
        return K.sum(K.maximum(p_dist - n_dist + self.alpha, 0), axis=0)

    def call(self, inputs):
        """Compute the loss, register it on the layer and return it as the output."""
        loss = self.triplet_loss(inputs)
        self.add_loss(loss)
        return loss

    def get_config(self):
        """Include ``alpha`` in the layer config so the layer can be re-created from it."""
        config = super().get_config()
        config['alpha'] = self.alpha
        return config

# Create the loss layer, then apply it to the anchor, positive and negative embeddings
loss_layer = TripletLossLayer(alpha=0.2, name='triplet_loss_layer')
triplet_loss_layer = loss_layer([emb_a, emb_p, emb_n])

# Trainable model: three image inputs, the triplet loss as its only output
nn4_small2_train = Model(inputs=[in_a, in_p, in_n], outputs=triplet_loss_layer)

In [None]:
from data import triplet_generator

# triplet_generator() creates a generator that continuously returns
# ([a_batch, p_batch, n_batch], None) tuples where a_batch, p_batch
# and n_batch are batches of anchor, positive and negative RGB images
# each having a shape of (batch_size, 96, 96, 3).
generator = triplet_generator()

# loss=None: no loss function is passed to compile(); the TripletLossLayer
# registers its loss through add_loss() in its call() method.
nn4_small2_train.compile(loss=None, optimizer='adam')
# NOTE(review): fit_generator is deprecated in recent Keras/TensorFlow releases in
# favour of fit() — confirm against the installed version before changing it.
nn4_small2_train.fit_generator(generator, epochs=10, steps_per_epoch=100)

# Please note that the current implementation of the generator only generates
# random image data. The main goal of this code snippet is to demonstrate
# the general setup for model training. In the following, we will use a
# pre-trained model anyway, so we don't need a generator here that operates
# on real training data. A fully functional generator may be provided later.

# TWO FACES ON AN IMAGE

In [None]:
import cv2
import matplotlib.pyplot as plt
import matplotlib.patches as patches

from align import AlignDlib

%matplotlib inline

def load_image(path):
    """Read the image at ``path`` and return it with channels in RGB order.

    Args:
        path: location of the image file.

    Returns:
        The image as loaded by OpenCV with its last axis reversed (BGR -> RGB).

    Raises:
        FileNotFoundError: if OpenCV cannot read an image from ``path``.
    """
    img = cv2.imread(path, cv2.IMREAD_COLOR)
    if img is None:
        # cv2.imread reports failure by returning None rather than raising
        raise FileNotFoundError(f'Could not read image: {path}')
    # OpenCV loads images with color channels
    # in BGR order. So we need to reverse them
    return img[..., ::-1]

# Initialize the OpenFace face alignment utility
alignment = AlignDlib('models/landmarks.dat')

# Load a photo that is expected to show two people
# NOTE(review): absolute, machine-specific path — replace with a project-relative file
jc_orig = load_image("/Users/kwassibenocharlesdokodjo/Downloads/Arthur_Charles.jpg")

# get all faces boundingboxes
bb = alignment.getAllFaceBoundingBoxes(jc_orig)
if len(bb) < 2:
    # The rest of the cell uses bb[0] and bb[1]; fail with a clear message
    # instead of an IndexError halfway through plotting.
    raise ValueError(f'Expected at least 2 faces in the image, found {len(bb)}')
first_two = (bb[0], bb[1])

# Align each of the two faces (same size argument and landmarks for both)
jc_aligned_0, jc_aligned_1 = (
    alignment.align(136, jc_orig, box, landmarkIndices=AlignDlib.OUTER_EYES_AND_NOSE)
    for box in first_two
)

plt.figure(figsize=(20,20))

# Show original image
plt.subplot(141)
plt.imshow(jc_orig)

# Show original image with both bounding boxes
plt.subplot(142)
plt.imshow(jc_orig)
for box in first_two:
    plt.gca().add_patch(patches.Rectangle((box.left(), box.top()), box.width(), box.height(), fill=False, color='red'))

# Show the two aligned faces
plt.subplot(143)
plt.imshow(jc_aligned_0)

plt.subplot(144)
plt.imshow(jc_aligned_1);

In [None]:
# Number of faces expected in the multi-face example image
n_faces=2

def align_images(img, n_faces):
    """Align the first ``n_faces`` faces detected in ``img``.

    Args:
        img: image passed to the face detector and to the aligner.
        n_faces: how many of the detected faces to align.

    Returns:
        numpy array holding one aligned face per requested face, in detector order.

    Raises:
        ValueError: if fewer than ``n_faces`` faces are detected.
    """
    bbs = alignment.getAllFaceBoundingBoxes(img)
    if len(bbs) < n_faces:
        # Clear message instead of an IndexError from bbs[i] below
        raise ValueError(f'Expected at least {n_faces} face(s) but detected {len(bbs)}')
    aligns = np.array([
        alignment.align(96, img, bbs[i], landmarkIndices=AlignDlib.OUTER_EYES_AND_NOSE)
        for i in range(n_faces)
    ])
    return aligns

In [None]:
# NOTE(review): absolute, machine-specific path — replace with a project-relative file
img = load_image("/Users/kwassibenocharlesdokodjo/Downloads/Arthur_Charles.jpg")
img = align_images(img, n_faces)

# Reset before the try block: if scaling fails, no stale embeddings from an
# earlier run of this cell can survive and be used by the cells below.
embedded_test = []
try:
    # scale RGB values to interval [0,1]
    img = (img / 255.).astype(np.float32)
    # obtain one embedding vector per aligned face
    for i in range(n_faces):
        embedded_test.append(nn4_small2_pretrained.predict(np.expand_dims(img[i], axis=0))[0])
except TypeError as err:
    # Keep the best-effort behavior, but never leave a partial result behind
    embedded_test = []
    print(f"Doesn't work: {err}")

In [None]:
# Minimum softmax score for a face to be reported under its predicted identity
thresh = 0.25

def predict_image(image_path, n_faces):
    """Identify ``n_faces`` faces in the image at ``image_path`` and show the result.

    The faces are aligned and embedded from the given image itself, classified
    with ``svc``, and any face whose highest softmax score does not exceed
    ``thresh`` is reported as 'Unknown Person'.

    Args:
        image_path: path of the image to analyse and display.
        n_faces: number of faces to identify in the image.

    Returns:
        The matplotlib title object naming the recognized people.
    """
    example_image = load_image(image_path)

    # Embed the faces of *this* image, so the labels always belong to the picture
    # that is displayed rather than to whatever a previous cell left in a global.
    faces = (align_images(example_image, n_faces) / 255.).astype(np.float32)
    embeddings = nn4_small2_pretrained.predict(faces)

    example_prediction = svc.predict(embeddings)
    identities = list(encoder.inverse_transform(example_prediction))

    # Row-wise softmax over the per-class decision scores, then the best score per face
    exp_scores = np.exp(svc.decision_function(embeddings))
    probs = exp_scores / np.sum(exp_scores, axis=1, keepdims=True)
    max_values = probs.max(axis=1)

    plt.imshow(example_image)

    # One entry per requested face (not just the first two)
    people = [
        identity if score > thresh else 'Unknown Person'
        for identity, score in zip(identities, max_values)
    ]

    people = ' and '.join(people)
    return plt.title(f'Recognized as {people}')