In [3]:
#!sudo pip install flask
#import sys
#!{sys.executable} -m pip install flask

In [16]:
from flask import Flask, request
import requests

# Load labels
from io import BytesIO
from tensorflow.python.lib.io import file_io

# For Dataset
import os
import glob
import json
import time
import torch
import cv2
from PIL import Image
import numpy as np
import pandas as pd
from tqdm.notebook import tqdm
from facenet_pytorch import MTCNN, InceptionResnetV1

# For Model
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras.layers import ZeroPadding2D, Convolution2D, MaxPooling2D, Dropout, Flatten, Activation, Dense
from tensorflow.keras.optimizers import SGD
from tensorflow.keras.models import load_model

# Resizing for tf shape
import einops

# Save result for later use
from numpy import asarray
from numpy import save
import joblib

In [6]:
# Source: https://www.kaggle.com/timesler/facial-recognition-model-in-pytorch
class DetectionPipeline:
    """Pipeline class for detecting faces in the frames of a video file."""
    
    def __init__(self, detector, n_frames=None, batch_size=128, resize=None):
        """Constructor for DetectionPipeline class.
        
        Keyword Arguments:
            detector {callable} -- Face detector. It is called with a list of PIL images
                (one batch) and its result is appended, item by item, to the output list.
            n_frames {int} -- Total number of frames to load. These will be evenly spaced
                throughout the video. If not specified (i.e., None), all frames will be loaded.
                (default: {None})
            batch_size {int} -- Number of frames passed to the detector per call.
                (default: {128})
            resize {float} -- Fraction by which to resize frames from original prior to face
                detection. A value less than 1 results in downsampling and a value greater than
                1 result in upsampling. (default: {None})
        """
        self.detector = detector
        self.n_frames = n_frames
        self.batch_size = batch_size
        self.resize = resize
    
    def __call__(self, filename):
        """Load frames from a video and detect faces.

        Arguments:
            filename {str} -- Path to video.

        Returns:
            list -- Concatenation of everything the detector returned for each batch
                of sampled frames, in frame order. Frames that could not be decoded
                are skipped.
        """
        # Create video reader and find length
        v_cap = cv2.VideoCapture(filename)
        try:
            v_len = int(v_cap.get(cv2.CAP_PROP_FRAME_COUNT))

            # Pick 'n_frames' evenly spaced frames to sample
            if self.n_frames is None:
                sample = np.arange(0, v_len)
            else:
                sample = np.linspace(0, v_len - 1, self.n_frames).astype(int)
            # A set gives constant-time membership tests; scanning the array for
            # every frame made the loop quadratic in the video length.
            wanted = set(sample.tolist())

            # Loop through frames
            faces = []
            frames = []
            for j in range(v_len):
                # grab() advances the reader even for frames we do not decode
                v_cap.grab()
                if j not in wanted:
                    continue

                # Load frame
                success, frame = v_cap.retrieve()
                if not success:
                    continue
                frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
                frame = Image.fromarray(frame)

                # Resize frame to desired size
                if self.resize is not None:
                    frame = frame.resize([int(d * self.resize) for d in frame.size])
                frames.append(frame)

                # When batch is full, detect faces and reset frame list
                if len(frames) >= self.batch_size:
                    faces.extend(self.detector(frames))
                    frames = []

            # Flush the trailing partial batch. Doing this after the loop (rather
            # than when the last sampled index is reached) keeps the pending frames
            # even if the final sampled frame failed to decode.
            if frames:
                faces.extend(self.detector(frames))
        finally:
            # Always free the capture handle, including when the detector raises
            v_cap.release()

        return faces

In [8]:
# Source: https://www.kaggle.com/timesler/facial-recognition-model-in-pytorch
def process_faces(faces, feature_extractor, max_faces=25, layer_index=3):
    """Turn detected faces into normalised deviation-from-centroid feature maps.

    Arguments:
        faces -- Iterable of per-frame detections; entries that are None (no face
            found in that frame) are dropped. May itself be None.
        feature_extractor -- Callable applied to each face batch after it has been
            rearranged from 'b c w h' to 'b w h c' and cast to float32. Its result
            is indexed with `layer_index`.
        max_faces {int} -- Only the first `max_faces` detections are embedded.
            None means no limit. (default: {25})
        layer_index {int} -- Which of the extractor's outputs to use.
            (default: {3})

    Returns:
        None when there is nothing to process, otherwise a flat list holding, for
        every processed detection, each item along the first axis of its scaled
        deviation tensor.
    """
    # Nothing to do for a missing result or when no frame contained a face
    if faces is None:
        return None
    faces = [f for f in faces if f is not None]
    if not faces:
        return None

    embeddings = []
    # Return a handful of faces
    for face in faces[:max_faces]:
        # Transforming image to what TF wants
        face = einops.rearrange(face, 'b c w h -> b w h c')

        # Extract the features; only the selected output is needed downstream,
        # so the other outputs are not kept around.
        feature = feature_extractor(tf.cast(face, tf.float32))
        embeddings.append(feature[layer_index])

    # Calculate centroid for video and distance of each face's feature vector from centroid
    centroid = sum(embeddings) / len(embeddings)

    distances = []
    for embed in embeddings:
        # Normalize distance (tf.linalg.normalize returns (normalized, norm))
        y = tf.linalg.normalize(embed - centroid)
        y = tf.convert_to_tensor(y[0])

        # Scale values of tensor to be between 0 and 1
        # NOTE(review): an all-zero deviation (e.g. a single detection, which equals
        # its own centroid) makes these divisions 0/0 -- callers should expect NaN
        # in that case; confirm the desired handling.
        y_min = tf.reduce_min(y)
        y = tf.truediv(
            tf.subtract(y, y_min),
            tf.subtract(tf.reduce_max(y), y_min)
        )

        # One output entry per item along the first axis
        distances.extend(y)

    return distances

In [33]:
# Model for original frames
# From the paper https://www.robots.ox.ac.uk/~vgg/publications/2015/Parkhi15/parkhi15.pdf

model_frames = keras.Sequential()

# Five identical stages: zero-pad by 1, a single 3x3 ReLU convolution, then
# 2x2 max-pooling with stride 2. Only the filter count changes per stage.
# The very first layer also declares the 160x160 RGB input shape.
for stage_idx, n_filters in enumerate((64, 128, 256, 512, 512)):
    first_layer_kwargs = {'input_shape': (160, 160, 3)} if stage_idx == 0 else {}
    model_frames.add(ZeroPadding2D((1, 1), **first_layer_kwargs))
    model_frames.add(Convolution2D(n_filters, (3, 3), activation='relu'))
    model_frames.add(MaxPooling2D((2, 2), strides=(2, 2)))

# Convolutional head: two 4096-filter layers with dropout, then a 2622-way
# output that is flattened and passed through softmax.
for head_kernel in ((5, 5), (1, 1)):
    model_frames.add(Convolution2D(4096, head_kernel, activation='relu'))
    model_frames.add(Dropout(0.5))
model_frames.add(Convolution2D(2622, (1, 1)))
model_frames.add(Flatten())
model_frames.add(Activation('softmax'))

model_frames.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
zero_padding2d (ZeroPadding2 (None, 162, 162, 3)       0         
_________________________________________________________________
conv2d (Conv2D)              (None, 160, 160, 64)      1792      
_________________________________________________________________
max_pooling2d (MaxPooling2D) (None, 80, 80, 64)        0         
_________________________________________________________________
zero_padding2d_1 (ZeroPaddin (None, 82, 82, 64)        0         
_________________________________________________________________
conv2d_1 (Conv2D)            (None, 80, 80, 128)       73856     
_________________________________________________________________
max_pooling2d_1 (MaxPooling2 (None, 40, 40, 128)       0         
_________________________________________________________________
zero_padding2d_2 (ZeroPaddin (None, 42, 42, 128)       0

In [34]:
# Wrap the Sequential model so that one forward pass returns the output of
# every layer (in layer order) instead of only the final softmax.
feature_extractor = keras.Model(
    inputs=model_frames.inputs,
    outputs=[lyr.output for lyr in model_frames.layers],
)

In [9]:
# Load face detector
# Face detector in inference mode; keep_all=True keeps every detected face.
# NOTE(review): factor=0.0 looks like a truncated value (e.g. 0.5) -- confirm
# it is intentional before changing it.
face_detector = MTCNN(
    margin=14,
    keep_all=True,
    factor=0.0,
).eval()

# Alternative facial recognition model (disabled; the Keras extractor is used instead)
# feature_extractor = InceptionResnetV1(pretrained='vggface2').eval()

# Detection pipeline: read every frame, shrink it to a quarter of its size,
# and run the detector on batches of 128 frames.
detection_pipeline = DetectionPipeline(
    detector=face_detector,
    n_frames=None,
    batch_size=128,
    resize=0.25,
)

In [35]:
# Load the saved Keras classifier from the working directory.
# This `model` object is what the /predict endpoint and the evaluation
# cells call `predict` / `evaluate` on.
model = load_model('model-e10-24.h5')

In [None]:
# Flask application object; the route decorators below register handlers on it.
app = Flask(__name__)

@app.route('/')
def home():
    """Return a fixed greeting for the root URL."""
    return "Hello, world!"

# Create /predict endpoint with POST method
@app.route('/predict', methods=['POST'])
def predict():
    """Run face detection and the classifier on the video named in the request.

    Expects a JSON body with a 'videoUrl' field. Responds with a JSON string
    containing 'data' (the submitted videoUrl) and 'prediction' (a list of
    strings: one per model output row, or ["No face found"] when no usable
    face features were produced). A body without 'videoUrl' gets a 400
    response with an 'error' field.
    """
    # silent=True yields None for a missing or malformed JSON body instead of raising
    request_json = request.get_json(silent=True)
    vid = request_json.get('videoUrl') if isinstance(request_json, dict) else None
    if not vid:
        return json.dumps({'error': "Missing 'videoUrl' in JSON request body"}), 400

    # SECURITY: `vid` is caller-supplied and is handed unchanged to
    # cv2.VideoCapture by the detection pipeline. Validate or restrict it
    # before exposing this endpoint to untrusted clients.

    X = []
    faces = []  # bound up front so the bookkeeping below survives a failed detection
    start = time.time()
    with torch.no_grad():
        try:
            # Load frames and find faces
            faces = detection_pipeline(vid)

            features = process_faces(faces, feature_extractor)

            if features is not None:
                for z in features:
                    # Skip feature maps containing NaN rather than asserting:
                    # the assert was swallowed by the handler below and silently
                    # discarded every later feature map as well.
                    if np.any(np.isnan(z)):
                        continue
                    X.append(z)
        except Exception as e:
            # Best effort: log and fall through to the "No face found" answer.
            # KeyboardInterrupt is deliberately not caught so Ctrl+C still
            # stops the server.
            print(e)

    n_processed = len(faces) if faces is not None else 0
    elapsed = time.time() - start
    if elapsed > 0:
        print(f'Frames per second (load+detect+embed): {n_processed / elapsed:6.3}\r', end='')

    if X:
        # Convert to array for prediction
        batch = np.asarray(X).astype('float32')
        prediction = model.predict(batch)
        # One string per output row (previously str(prediction) was split into
        # single characters by list()).
        prediction_strings = [str(d) for d in prediction]
    else:
        prediction_strings = ["No face found"]

    response_json = {
        'data' : vid,
        'prediction' : prediction_strings
    }

    return json.dumps(response_json)

# Start Flask's built-in server when this is executed as the main module.
# The call blocks until the server is stopped.
if __name__ == '__main__':
    # host='0.0.0.0' listens on all network interfaces, port 5000.
    # NOTE(review): the captured log warns to use a production WSGI server instead.
    app.run(host='0.0.0.0', port=5000)
    # Alternative: Flask defaults for host and port
    #app.run()

 * Serving Flask app '__main__' (lazy loading)
 * Environment: production
[2m   Use a production WSGI server instead.[0m
 * Debug mode: off


 * Running on all addresses.
 * Running on http://10.148.0.2:5000/ (Press CTRL+C to quit)
34.87.131.25 - - [08/Jun/2021 13:05:54] "POST /predict HTTP/1.1" 200 -


Frames per second (load+detect+embed): 1.29e+02

##### Label prediction using the model trained with model.fit (manual)

In [None]:
# Load the saved feature array (X) and label array (y).
# The `with` block guarantees the file handle is closed; the previous
# `file.close` lacked parentheses and never closed it.
# SECURITY: allow_pickle=True can execute arbitrary code embedded in the file;
# only load files you created yourself.
with open('X-24', 'rb') as file:
    X = np.load(file, allow_pickle=True)

y = np.load('Y-24')

In [None]:
# Sequential 80:20 split: the first 80% of the samples (in stored order)
# form the training set, the remainder the test set. No shuffling is done.
size = int(0.8 * len(X))

x_train, y_train = X[:size], y[:size]
x_test, y_test = X[size:], y[size:]

In [11]:
# Drop the full arrays and the training split; only x_test / y_test are used
# by the evaluation cell that follows.
# NOTE(review): a later cell still iterates over X -- it will need X to be
# defined again before it can run.
import gc

del X, y, x_train, y_train
gc.collect()

2419

In [14]:
# Score the loaded model on the held-out split, 128 samples per batch
results = model.evaluate(x=x_test, y=y_test, batch_size=128)

# Sanity check: predict the first three test samples and show the output shape
predictions = model.predict(x_test[:3])
print(predictions.shape)

(3, 1)


In [15]:
# Display the raw model outputs for the three test samples predicted above
predictions

array([[0.7547928 ],
       [0.75821173],
       [0.75627834]], dtype=float32)

##### Prediction probabilities from the processed face features (X), computed manually

In [None]:
# Prediction probability of face using X
# Expand the pattern into actual file paths. Zipping over the bare pattern
# string iterated over its individual characters instead of over videos.
# NOTE(review): X must hold one entry per video, in the same order as this
# listing (None for videos without usable faces) -- confirm how X is built,
# and note that an earlier cell deletes X.
filenames = glob.glob('../resources/dataset/test_videos/*.mp4')

# Prediction: hand-set logistic model, sigmoid(bias + mean(weight * features))
bias = -0.4
weight = 0.68235746

probabilities = []
for filename, x_i in zip(filenames, X):
    if x_i is not None:
        prob = 1 / (1 + np.exp(-(bias + (weight * x_i).mean())))
    else:
        # No features for this video: fall back to an uninformative 0.5
        prob = 0.5
    probabilities.append([os.path.basename(filename), prob])