# Fingerprint Identification System

## Import Libraries

In [None]:
# OS
import os

# Arrays
import numpy as np 

# Tensorflow and Keras
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.models import load_model

# Sklearn
import sklearn
from sklearn.model_selection import train_test_split

# Math
import math

# Plots
import matplotlib as mpl
mpl.rc('image', cmap='gray') # set color map to gray when plotting images
from matplotlib import pyplot as plt 

# Fix seeds and create tensorflow session for reproducible results
SEED = 42
# Seed NumPy's global random number generator
np.random.seed(SEED)
# NOTE(review): PYTHONHASHSEED is normally read when the interpreter starts, so setting
# it here presumably only affects processes spawned later - confirm this is intended
os.environ['PYTHONHASHSEED'] = str(SEED)
# Seed TensorFlow's global random number generator
tf.random.set_seed(SEED)
from keras import backend as K
# Restrict TensorFlow to one thread for both intra-op and inter-op parallelism
session_conf = tf.compat.v1.ConfigProto(intra_op_parallelism_threads=1, inter_op_parallelism_threads=1)
# Create a TF1-compatibility session on the default graph and register it with Keras
sess = tf.compat.v1.Session(graph=tf.compat.v1.get_default_graph(), config=session_conf)
tf.compat.v1.keras.backend.set_session(sess)

## Load Strategy for TPU, GPU or CPU Acceleration

In [None]:
# Probe for a TPU cluster; the ValueError handler below covers the "no TPU" case
try:
    tpu = tf.distribute.cluster_resolver.TPUClusterResolver()
    print('Running on TPU ', tpu.master())
except ValueError:
    tpu = None

if not tpu:
    # No TPU found: use the default distribution strategy (GPU or CPU)
    strategy = tf.distribute.get_strategy()
else:
    # Connect to the TPU cluster, initialise it and build a TPU strategy on it
    tf.config.experimental_connect_to_cluster(tpu)
    tf.tpu.experimental.initialize_tpu_system(tpu)
    strategy = tf.distribute.TPUStrategy(tpu)

## Load Data

In [None]:
# Locations of the pre-extracted training arrays (images and their labels)
train_images_file = '/kaggle/input/fingerprint-dataset-for-fvc2000-db4-b/dataset_FVC2000_DB4_B/dataset/np_data/img_train.npy'
train_labels_file = '/kaggle/input/fingerprint-dataset-for-fvc2000-db4-b/dataset_FVC2000_DB4_B/dataset/np_data/label_train.npy'
# Load through the path variables so that every path is defined in one place only
images_data = np.load(train_images_file)
labels_data = np.load(train_labels_file)

10 classes of 80 samples each

In [None]:
# Display the distinct labels together with the number of samples of each
np.unique(labels_data, return_counts=True) 

In [None]:
# Show the first training image as a visual sanity check of the loaded data
plt.imshow(images_data[0]) # plot a sample

## Configuration

Hyperparameters for the model

In [None]:
# Hyperparameters and settings shared by the data pipeline and the model
params = {
    'image_size': (160, 160, 3),  # Shape the images are resized to
    'num_classes': 10,            # Number of classes
    'max_epochs': 75,             # Number of epochs for training the model
    'batch_size': 1,              # Low batch takes more time to run but gives more weight to the stochastic properties of SGD
    'lr': 0.001,                  # Learning rate
    'seed': SEED,                 # Seed for reproducibility
    'validation_size': 0.25,      # Size of validation set
}

## Train Validation Split

Split the train data into a train and validation set in a stratified way

In [None]:
# Stratified split: `stratify=labels_data` keeps the class proportions in both subsets
(train_images, validation_images,
 train_labels, validation_labels) = train_test_split(
    images_data,
    labels_data,
    test_size=params['validation_size'],
    random_state=params['seed'],
    stratify=labels_data,
)

# Report the subset sizes and the per-class sample counts of each subset
print(f"Train set length: {len(train_images)}")
print(f"Validation set length: {len(validation_images)}")
print(f"Train set -> number of unique labels: {np.unique(train_labels, return_counts=True)}")
print(f"Validation set -> number of unique labels: {np.unique(validation_labels, return_counts=True)}")

## Data Loader

In [None]:
class DataLoader(keras.utils.Sequence):
    """Keras ``Sequence`` that serves pre-processed image batches.

    All pre-processing happens once, in the constructor: the images are cast to
    float32 and divided by 255, repeated three times along the last axis,
    optionally shuffled and optionally resized. Batches are then plain slices.

    Args:
        images: NumPy array of images. NOTE(review): the scaling assumes 8-bit
            pixel values and the channel repetition assumes a trailing channel
            axis of size 1 (grayscale) - confirm against the data.
        labels: Optional label array. Batches take column 0 of it, so it is
            expected to be 2-D. When omitted the loader runs in "test" mode and
            returns placeholder labels of -1.
        params: Optional configuration dict; when given, images are resized to
            ``params['image_size'][:2]``.
        batch_size: Number of samples per batch.
        shuffle: Shuffle the samples once at construction time. The loader
            itself does not reshuffle afterwards.
    """

    def __init__(self, images, labels=None, params=None, batch_size=32, shuffle=False):
        # Initialise the Keras base class as well (newer Keras versions expect this)
        super().__init__()
        scaled = images.astype(np.float32) / 255.  # Scale the images to range [0, 1]
        self.images = np.repeat(scaled, 3, axis=-1)  # convert from grayscale to RGB
        self.labels = labels
        self.batch_size = batch_size
        self.train = labels is not None
        if shuffle:
            if self.train:
                # Shuffle images and labels jointly so that pairs stay aligned
                self.images, self.labels = sklearn.utils.shuffle(self.images, self.labels)
            else:
                self.images = sklearn.utils.shuffle(self.images)

        if params is not None:
            # Resize to (height, width); the result is a TensorFlow tensor
            self.images = tf.image.resize(self.images, params['image_size'][:2])

    def __len__(self):
        """Return the number of complete batches (an incomplete last batch is dropped)."""
        return int(np.floor(len(self.images) / self.batch_size))

    def __getitem__(self, index):
        """Return batch number `index` as an ``(images, labels)`` tuple."""
        start = index * self.batch_size
        stop = start + self.batch_size
        images_batch = self.images[start:stop]
        if not self.train:
            # Test mode: there are no labels, so return -1 placeholders
            return (images_batch, -np.ones(images_batch.shape[0]))
        labels_batch = self.labels[start:stop, 0]
        return (images_batch, labels_batch)

## Plot Batch of Samples

In [None]:
def plot_batch(images_batch, labels_batch, predicted_labels=None, cols=5, figsize=(24, 6)):
    """Plot a batch of samples in a grid, each titled with its label.

    Without predictions every title is the true label, drawn in green. With
    predictions the title is ``"<predicted> | <label>"``, green when both agree
    and red otherwise.

    Args:
        images_batch: Iterable of images accepted by ``imshow``.
        labels_batch: True labels, one per image (any type that can be formatted).
        predicted_labels: Optional predicted labels aligned with `labels_batch`;
            ``None`` or an empty sequence means "no predictions".
        cols: Number of grid columns.
        figsize: Size of the whole figure.
    """
    # Explicit length check: works for lists and arrays alike (array truthiness is ambiguous)
    has_predictions = predicted_labels is not None and len(predicted_labels) > 0
    rows = max(1, int(np.ceil(len(labels_batch) / cols)))
    # squeeze=False always returns a 2-D axes array, so a 1x1 grid can be flattened too
    figure, ax = plt.subplots(nrows=rows, ncols=cols, figsize=figsize, squeeze=False)
    axes = ax.ravel()
    # Hide all axes up front so that unused grid cells stay blank
    for axis in axes:
        axis.set_axis_off()
    for i, (image, label) in enumerate(zip(images_batch, labels_batch)):
        if has_predictions:
            # f-strings also work for numeric labels, unlike string concatenation
            title = f"{predicted_labels[i]} | {label}"
            color = "green" if label == predicted_labels[i] else "red"
        else:
            title = f"{label}"
            color = "green"
        axes[i].imshow(image)
        axes[i].set_title(title, color=color)
    plt.tight_layout()
    plt.show()

Let's visualize a batch of fingerprints with their labels

In [None]:
# Loader with 20 shuffled samples per batch, used here for visual inspection
train_loader = DataLoader(
    images=train_images,
    labels=train_labels,
    params=params,
    batch_size=20,
    shuffle=True,
)
# First batch as an (images, labels) tuple, unpacked straight into the plot helper
batch = train_loader[0]
plot_batch(*batch, cols=10)

## Train & Validation Data Loader

The train and validation data loaders retrieve batches of samples for training a neural network

In [None]:
# Both loaders share the configured batch size and are shuffled once on creation
train_loader = DataLoader(
    images=train_images,
    labels=train_labels,
    params=params,
    batch_size=params['batch_size'],
    shuffle=True,
)
validation_loader = DataLoader(
    images=validation_images,
    labels=validation_labels,
    params=params,
    batch_size=params['batch_size'],
    shuffle=True,
)

## Image Classification Model

In [None]:
import tensorflow_addons as tfa
with strategy.scope():
    # Backbone: ResNet50V2 with ImageNet weights and without its classification top.
    # NOTE(review): `classes` and `classifier_activation` configure the top that is
    # excluded here, so they presumably have no effect - confirm
    pretrained_model = tf.keras.applications.ResNet50V2(
        include_top=False,
        weights="imagenet",
        classes=params['num_classes'],
        classifier_activation="softmax",
    )

    # Freeze every backbone layer: only the new head defined below is trained
    for layer in pretrained_model.layers:
        layer.trainable = False

    # The shape of each intermediate tensor is printed as the graph is built
    image = tf.keras.Input(shape=params['image_size'])
    print(image.shape)
    # Feature maps produced by the frozen backbone
    x = pretrained_model(image)
    print(x.shape)
    # Global average pooling collapses the spatial dimensions into one feature vector
    x = tf.keras.layers.GlobalAveragePooling2D()(x)
    print(x.shape)
    # Dense layer mapping the pooled features to one score per class
    x = tf.keras.layers.Dense(params['num_classes'])(x)
    print(x.shape)
    # Softmax (in float32) turns the class scores into probability estimates
    output = tf.keras.layers.Softmax(dtype='float32')(x)
    print(output.shape)
    model = tf.keras.Model(inputs=[image], outputs=[output])

## Loss, Performance Metric, Optimizer

In [None]:
with strategy.scope():
    # Cross-entropy loss on integer class labels. The model ends in a Softmax layer
    # and therefore outputs probabilities, so the loss must NOT treat its input as
    # logits (from_logits=True would effectively apply softmax a second time)
    loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=False)

    # Adam optimizer with the configured initial learning rate
    optimizer = tf.keras.optimizers.Adam(learning_rate=params['lr'])

    # Accuracy metric; Keras reports it as 'sparse_categorical_accuracy'
    accuracy = tf.keras.metrics.SparseCategoricalAccuracy()
    metrics = [accuracy]

    # Compile the model
    model.compile(optimizer=optimizer,
                  loss=loss,
                  metrics=metrics)

## Callbacks

In [None]:
# Training callbacks; the following cells append to this list before it is passed to model.fit
callbacks = []

### Scheduler

Decrease the learning rate by exponential decay

In [None]:
def scheduler(epoch, lr=None):
    """Return the learning rate to use during `epoch`.

    Exponential decay per epoch: ``params['lr'] * 0.99 ** epoch``.

    LearningRateScheduler calls its schedule with the epoch index and the current
    learning rate and expects a plain float back. An optimizer-style
    ``ExponentialDecay`` object takes a single step argument and returns a
    tensor, so an explicit function is used instead.

    Args:
        epoch: Zero-based epoch index.
        lr: Current learning rate supplied by the callback; unused because the
            decay is computed from the initial rate.

    Returns:
        The learning rate as a Python float.
    """
    return float(params['lr'] * 0.99 ** epoch)


scheduler_callback = tf.keras.callbacks.LearningRateScheduler(scheduler)
callbacks.append(scheduler_callback)

### Model Checkpoints

Save the model with the best accuracy

In [None]:
# Checkpoint the full model (not only its weights) at the end of an epoch, keeping
# just the best one according to the monitored validation accuracy
checkpoint_callback = tf.keras.callbacks.ModelCheckpoint(
    filepath='./model/fingerprint-recognition.h5',
    monitor='val_sparse_categorical_accuracy',
    mode='auto',
    save_best_only=True,
    save_weights_only=False,
    save_freq='epoch',
    verbose=0,
    options=None,
    initial_value_threshold=None,
)
callbacks.append(checkpoint_callback)

## Train Model

In [None]:
# Both loaders are keras Sequences that already yield complete batches, so
# `batch_size` is not passed: Keras documents it as unsupported for such inputs
history = model.fit(train_loader,
                    validation_data=validation_loader,
                    epochs=params['max_epochs'],
                    callbacks=callbacks)

### Plot Losses and Performance Metrics

In [None]:
# One figure per tracked quantity:
# (title, train key, validation key, train label, validation label, output file)
curves = (
    ('Loss', 'loss', 'val_loss',
     'Train Loss', 'Validation Loss', './nn_fingerprint_loss.png'),
    ('Accuracy', 'sparse_categorical_accuracy', 'val_sparse_categorical_accuracy',
     'Train Accuracy', 'Validation Accuracy', './nn_fingerprint_accuracy.png'),
)
for title, train_key, val_key, train_label, val_label, out_file in curves:
    fig = plt.figure(figsize=(10, 5))
    plt.plot(history.history[train_key], label=train_label)
    plt.plot(history.history[val_key], label=val_label)
    plt.title(title)
    plt.xlabel('Epoch')
    plt.legend()
    # Save before show() so the figure is written while it is still current
    plt.savefig(out_file)
    plt.show()

## Model Architecture

In [None]:
# Print the layer-by-layer summary of the classification model
model.summary()

# NN-Based Fingerprint Identification System

Let's now build a fingerprint identification system based on the neural-network (NN) features extracted from the fingerprints

## Import Additional Libraries

In [None]:
import cv2

import pandas as pd

from pathlib import Path

from tqdm.notebook import tqdm as tqdm_notebook

## Load Test Data

In [None]:
def read_DB(path):
    """Read every PNG image found below `path` as a grayscale image.

    The directory is searched recursively and the files are processed in sorted
    path order. The label of an image is the first three characters of its file
    name (without extension).

    Args:
        path: Root directory of the image database (str or Path).

    Returns:
        Tuple ``(images, labels)`` of two equally long lists.

    Raises:
        IOError: If a file that was found cannot be decoded as an image.
    """
    images = []
    labels = []
    imagePaths = sorted(Path(path).rglob("*.png"))
    for imagePath in tqdm_notebook(imagePaths):
        # Use the full path returned by rglob: joining `path` with the bare file name
        # breaks for files in sub-directories and for a `path` without trailing separator
        image = cv2.imread(str(imagePath))
        if image is None:
            # cv2.imread reports an unreadable file by returning None instead of raising
            raise IOError(f"Could not read image: {imagePath}")
        if image.ndim > 2:
            image = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
        images.append(image)
        labels.append(imagePath.stem[0:3])
    return (images, labels)


# Read the fingerprint database: grayscale images plus their 3-character labels
images_db, labels_db = read_DB('../input/NIST301/')

# Save some metadata
# Number of images that were read
n_imgs = len(images_db)
# Height and width are taken from the first image
# NOTE(review): presumably all images share this size - confirm
img_height, img_width = images_db[0].shape

In [None]:
# Stack the images into one array, append a channel axis and serve everything
# as a single unlabelled batch (batch size = number of images)
test_loader = DataLoader(
    np.array(images_db)[..., np.newaxis],
    params=params,
    batch_size=len(images_db),
)

## Load Trained Model

In [None]:
# Load the trained classification model from the attached input dataset
fingerprint_recognition_model = load_model(
    '../input/fingerprint-recognition-dnn/fingerprint-recognition.h5'
)
# Cut the network before its last layer (the softmax): the goal here is not to
# predict a class but to retrieve an embedding for each image
fingerprint_embedding_model = keras.Model(
    inputs=fingerprint_recognition_model.input,
    outputs=fingerprint_recognition_model.layers[-2].output,
)

## Retrieve Test and Perpetrator Fingerprint Embeddings

In [None]:
# Embed all test images in one forward pass (the test loader holds a single batch)
test_embeddings = fingerprint_embedding_model(test_loader[0][0])
# The last image is the perpetrator's fingerprint; every other one is a candidate
perpetrator_embedding = test_embeddings[-1]
test_embeddings = test_embeddings[:-1]

## Similarity Measure

In [None]:
def mss(x, y):
    """Return the similarity ``1 / (1 + MSE(x, y))`` of two equally shaped inputs.

    The score is 1 for identical inputs and decreases towards 0 as the mean
    squared difference grows.

    Args:
        x: First vector (anything ``np.asarray`` can convert).
        y: Second vector, broadcast-compatible with `x`.

    Returns:
        The similarity score as a floating-point scalar.
    """
    # Convert first so that plain sequences are supported as well as arrays
    difference = np.asarray(x) - np.asarray(y)
    return 1 / (1 + np.square(difference).mean())

## Helper Functions

Plot a sequence of images

In [None]:
# Helper functions
def plot_image_sequence(data, n, imgs_per_row=7, figsize=(10,10), cmap='gray'):
    """Plot the first `n` images of `data` in a grid, filled row by row.

    Args:
        data: Indexable collection of images accepted by ``imshow``.
        n: Number of images to plot (at least 1).
        imgs_per_row: Maximum number of images per grid row (at least 1).
        figsize: Size of ONE grid cell; the figure is scaled by the grid shape.
        cmap: Colour map passed to ``imshow``.

    Returns:
        Tuple ``(figure, axes)`` exactly as returned by ``plt.subplots``.

    Raises:
        ValueError: If `n` or `imgs_per_row` is smaller than 1.
    """
    if n < 1 or imgs_per_row < 1:
        raise ValueError("n and imgs_per_row must both be at least 1")
    n_cols = min(imgs_per_row, n)
    # Ceiling division: exactly as many rows as are needed to hold n images
    n_rows = (n + n_cols - 1) // n_cols

    f,ax = plt.subplots(n_rows,n_cols, figsize=(figsize[0]*n_cols,figsize[1]*n_rows))
    # plt.subplots returns a single Axes, a 1-D or a 2-D array depending on the grid;
    # a flat row-major view handles all three cases uniformly
    axes = np.atleast_1d(ax).ravel()
    for i in range(n):
        axes[i].imshow(data[i], cmap=cmap)
    # Blank out the unused cells of the last row
    for unused in axes[n:]:
        unused.set_axis_off()
    plt.show()
    return f, ax

Construct a table of similarity scores between a target image and many candidate images

In [None]:
def constructSimilarityTable(org_img, img_db, labels, dist_func):
    """Score every entry of `img_db` against `org_img`.

    Args:
        org_img: The target item that every candidate is compared with.
        img_db: Sized iterable of candidate items.
        labels: Labels indexed by candidate position (``labels[i]`` belongs to
            the i-th candidate).
        dist_func: Callable ``dist_func(org_img, candidate)`` returning the score.

    Returns:
        DataFrame with one row per candidate and the columns 'id' and 'score'.
    """
    rows = [
        [labels[position], dist_func(org_img, candidate)]
        for position, candidate in enumerate(img_db)
    ]
    assert (len(rows) == len(img_db))
    return pd.DataFrame(rows, columns=['id', 'score'])

## Compute the Similarity Scores

Compute the similarity scores between the perpetrator fingerprint embedding and all candidate fingerprint embeddings

In [None]:
# Score every candidate embedding against the perpetrator embedding with `mss`;
# row i of the resulting table belongs to candidate i and carries labels_db[i] as its id
sim_tb_nn = constructSimilarityTable(perpetrator_embedding, test_embeddings, labels_db, mss)

In [None]:
# Rank the candidates from most to least similar
sim_tb_nn = sim_tb_nn.sort_values(by='score', ascending=False)
ids, scores = sim_tb_nn.values[:, 0], sim_tb_nn.values[:, 1]

fig = plt.figure(figsize=(12, 6))
plt.plot(ids, scores)
# Label every third candidate; the positions follow the table size rather than a
# fixed count, so smaller or larger databases work as well
tick_positions = np.arange(0, len(ids), 3)
plt.xticks(tick_positions, ids[tick_positions], rotation=45)
plt.title('Highest Score ID: ' + ids[0])
plt.xlabel('Fingerprint ID')
plt.ylabel('Fingerprint NN Features Match Score')
fig.savefig('fingerprint-nn-features-match-scores.png')

# Print rank, id and score of the (at most) ten best matches
for rank in range(min(10, len(sim_tb_nn))):
    print(rank + 1, sim_tb_nn.iloc[rank, 0], f"{sim_tb_nn.iloc[rank, 1]:.5f}")

Let's have a look at the best matches for the perpetrator

In [None]:
# Perpetrator's fingerprint: the last image of the database
plt.imshow(images_db[-1], cmap='gray')

The 14 best matches: the first 6 correspond to the perpetrator's fingerprint.

In [None]:
# Show the best-ranked candidates. Sorting the table keeps its original index, and
# that index is each candidate's position in images_db, so the images are looked up
# by position instead of by parsing the label as a 1-based number
top_k = min(14, len(sim_tb_nn))
best_matches = [images_db[position] for position in sim_tb_nn.index[:top_k]]
fig, ax = plot_image_sequence(best_matches, top_k)
fig.savefig('nn-best-fingerprint-matches.png')