# Private XGBoost on Fingerprint

### Set up

Install dependencies from https://github.com/awslabs/privacy-preserving-xgboost-inference

In [1]:
from ppxgboost import PPPrediction as prediction
import ppxgboost.PPModel as PPModel
from ppxgboost import PaillierAPI as paillier
import ppxgboost.OPEMetadata as OPEMetadata
import ppxgboost.PPKey as PPKey
import ppxgboost.PPQuery as PPQuery
import random
import time
import pandas as pd
import numpy as np
import xgboost as xgb
from xgboost import XGBClassifier
from secrets import token_bytes
import pyope.ope as pyope
import matplotlib.pyplot as plt
import joblib
import os
import cv2
import glob
import imageio
from skimage.morphology import skeletonize, thin
from skimage import io, img_as_bool, img_as_ubyte
from skimage.filters import threshold_otsu
from skimage.feature import canny


In [2]:
def preprocess_image(image):
    """Blur, binarize and skeletonize an image.

    The input is smoothed with a 3x3 Gaussian kernel, thresholded with
    Otsu's method in inverted-binary mode (output values 0 or 255), scaled
    down to 0/1 and passed to ``skeletonize``.

    Args:
        image: Image array accepted by ``cv2.GaussianBlur``; the loader in
            this notebook passes a grayscale image.

    Returns:
        The skeleton after conversion with ``img_as_ubyte``.
    """
    # Smooth first so isolated noisy pixels have less influence on the threshold.
    blurred = cv2.GaussianBlur(image, (3, 3), 0)

    # cv2.threshold returns (threshold_used, image); only the image is needed.
    otsu_inverted = cv2.THRESH_BINARY_INV + cv2.THRESH_OTSU
    binary_image = cv2.threshold(blurred, 0, 255, otsu_inverted)[1]

    # Reduce the 0/255 mask to 0/1 before skeletonizing.
    unit_mask = binary_image // 255
    return img_as_ubyte(skeletonize(unit_mask))

def extract_minutiae(skeleton):
    """Locate candidate minutiae on a skeleton image.

    A hit-or-miss morphological transform with a fixed 3x3 kernel is applied
    to ``skeleton``; every pixel with a positive response is reported.

    Args:
        skeleton: Skeleton image as produced by ``preprocess_image``.

    Returns:
        Integer array with one row per matching pixel; the columns are the
        index arrays from ``np.where`` (row, column for a 2-D input).
    """
    pattern = np.uint8([[0, 0, 0],
                        [1, 1, 0],
                        [0, 1, 0]])
    hits = cv2.morphologyEx(skeleton, cv2.MORPH_HITMISS, pattern)

    # Turn the response mask into a list of pixel coordinates.
    hit_indices = np.where(hits > 0)
    return np.column_stack(hit_indices)

def create_feature_vector(minutiae_points, image_shape, grid_size=(8, 8)):
    """Count minutiae per grid cell and return the counts as a flat vector.

    The image is divided into ``grid_size[0]`` x ``grid_size[1]`` cells and
    each minutia increments the counter of the cell it falls in. Cells are
    stored row-major: cell (r, c) is at index ``r * grid_size[1] + c``.

    Args:
        minutiae_points: Iterable of points indexable as ``(row, col)``
            with integer pixel coordinates.
        image_shape: Shape of the image; only ``image_shape[0]`` (height)
            and ``image_shape[1]`` (width) are read.
        grid_size: ``(rows, cols)`` of the grid. Defaults to ``(8, 8)``.

    Returns:
        1-D float array of length ``grid_size[0] * grid_size[1]``.
    """
    n_rows, n_cols = grid_size[0], grid_size[1]
    feature_vector = np.zeros(n_rows * n_cols)

    # Floor the cell size at 1 so an image smaller than the grid does not
    # cause a division by zero below.
    cell_height = max(1, image_shape[0] // n_rows)
    cell_width = max(1, image_shape[1] // n_cols)

    for minutia in minutiae_points:
        # Clamp to the last row/column. When the image size is not an exact
        # multiple of the grid, the leftover edge pixels would otherwise map
        # past the grid and either overflow the vector or be counted in the
        # wrong cell.
        row = min(minutia[0] // cell_height, n_rows - 1)
        col = min(minutia[1] // cell_width, n_cols - 1)

        feature_vector[row * n_cols + col] += 1

    return feature_vector

     

In [3]:
# Define a function to load images from a folder
def load_images_from_folder(folder, image_size=(160, 160)):
    """Load every readable image in ``folder`` and derive its grid features.

    Each file is read as grayscale, resized to ``image_size``, skeletonized
    with ``preprocess_image``, scanned with ``extract_minutiae`` and
    summarised with ``create_feature_vector``. Files that ``cv2.imread``
    cannot decode are skipped.

    Args:
        folder: Path of the directory to scan.
        image_size: Target size passed to ``cv2.resize`` (width, height).
            Defaults to ``(160, 160)``.

    Returns:
        Tuple ``(images, labels, features)`` of parallel lists: the resized
        grayscale image, the integer label parsed from the filename (the
        text before the first ``_``, cut at the first ``.``), and the
        feature vector.

    Raises:
        ValueError: If a readable image's filename does not start with an
            integer label.
    """
    images = []
    labels = []
    features = []

    # Sorted so the sample order does not depend on the arbitrary order in
    # which the filesystem lists the directory.
    for filename in sorted(os.listdir(folder)):
        img = cv2.imread(os.path.join(folder, filename), cv2.IMREAD_GRAYSCALE)

        # cv2.imread returns None for files it cannot read. The check must
        # come before cv2.resize, which raises on a None input.
        if img is None:
            continue

        img = cv2.resize(img, image_size)
        skeleton_image = preprocess_image(img)
        minutiae_points = extract_minutiae(skeleton_image)

        images.append(img)
        labels.append(int(filename.split('_')[0].split('.')[0]))
        features.append(create_feature_vector(minutiae_points, img.shape))

    return images, labels, features

In [4]:
# Change the dataset location if necessary.
# Each call returns (images, labels, features); the images are discarded here.
_, y_train, x_train = load_images_from_folder('../data/dataset_FVC2000_DB4_B/dataset/train_data')
_, y_test, x_test = load_images_from_folder('../data/dataset_FVC2000_DB4_B/dataset/real_data')

In [5]:
# Column names for the grid features: one "(row,col)" label per cell, listed
# row by row so they line up with the row-major feature vector.
col = 8
row = 8
grid = [f"({i},{j})" for i in range(row) for j in range(col)]


In [6]:
# Wrap the training feature vectors in a DataFrame (one row per image)
x_train = pd.DataFrame(x_train)

# Do the same for the test feature vectors
x_test = pd.DataFrame(x_test)

# Print the number of training samples and the shape of the training feature table
print("Number of training images:", len(x_train))
print("Shape of the training:", x_train.shape)

# Convert the label lists to NumPy arrays
y_train = np.asarray(y_train)
y_test = np.asarray(y_test)

Number of training images: 800
Shape of the training: (800, 64)


In [7]:
# Name the feature columns after their "(row,col)" grid cells.
x_train.columns = grid
x_test.columns = grid

In [8]:
# Fit a multi-class XGBoost classifier (n_estimators=20) on the grid features.
total_estimaters = 20
model = xgb.XGBClassifier(n_estimators=total_estimaters, objective='multi:softmax')
model.fit(x_train, y_train)

In [9]:
from sklearn.metrics import accuracy_score

# Plaintext baseline: predict on the test features with the unencrypted model.
y_pred = model.predict(x_test)
predictions = [round(value) for value in y_pred]

# Report the plaintext accuracy so the private prediction below has a
# baseline to be compared against.
accuracy = accuracy_score(y_test, predictions)
print("Accuracy: %.2f%%" % (accuracy * 100.0))

In [11]:
# Number of classes learned by the fitted classifier
num_classes = model.n_classes_

# Create custom data ranges
in_range = pyope.ValueRange(pyope.DEFAULT_IN_RANGE_START, 2 ** 43 - 1)
out_range = pyope.ValueRange(pyope.DEFAULT_OUT_RANGE_START, 2 ** 63 - 1)

# Parse the trained booster into a PPModel and read its feature set
ppModel = PPModel.from_xgboost_model(model.get_booster())
features = ppModel.get_features()
# The 0-100 data range is a placeholder: this notebook only checks that the
# encrypted model reproduces the plaintext predictions.
# NOTE(review): presumably the 2nd/3rd arguments are the min/max feature
# values -- confirm against ppxgboost and the real range of x_test.
metadata = OPEMetadata.OPEMetadata(ppModel, 0, 100, in_range.end)

In [12]:
# Set up encryption materials: one key object for the model, one for queries.
ppModelKey, ppQueryKey = PPKey.generatePPXGBoostKeys(in_range, out_range)

# 1. Encrypt the parsed tree model with the model key and the OPE metadata
enc_model = ppModel.encrypt(ppModelKey, metadata)


In [13]:
# 2. Encrypt the test feature vectors for prediction. The encryptor is built
#    from the query key, the model's feature set and the OPE metadata.
#    (PPQuery comes from the import cell at the top of the notebook.)
queryEncryptor = PPQuery.QueryEncryptor(ppQueryKey, features, metadata)
queries = PPQuery.pandas_to_queries(x_test)
enc_queries = PPQuery.encrypt_queries(queryEncryptor, queries)

In [15]:
# 3. Evaluate the encrypted model on the encrypted queries
#    (OPE evaluation based on the OPE-encrypted values in the tree nodes).
enc_predictions = prediction.predict_multiclass(enc_model, num_classes, enc_queries)

In [18]:
# 4. Client side: decrypt the encrypted predictions with the private key
#    taken from the query key object.
result = prediction.client_decrypt_prediction_multiclass(ppQueryKey.get_private_key(), enc_predictions)

In [20]:
# Correctness check: the decrypted private predictions must equal the
# plaintext model's predictions on the same test features.
real_y = model.predict(x_test)
assert np.array_equal(result, real_y)
print("success!")


success!
