# Real-time face detection & ID identification model for fast check-in.

# Install necessary packages

In [1]:
%%shell
# Pin the versions this notebook was developed against (taken from the recorded
# install log below) so a fresh runtime resolves the same packages.
pip install facenet-pytorch==2.5.2
pip install mmcv==1.4.7

Collecting facenet-pytorch
  Downloading facenet_pytorch-2.5.2-py3-none-any.whl (1.9 MB)
[K     |████████████████████████████████| 1.9 MB 7.2 MB/s 
Installing collected packages: facenet-pytorch
Successfully installed facenet-pytorch-2.5.2
Collecting mmcv
  Downloading mmcv-1.4.7.tar.gz (500 kB)
[K     |████████████████████████████████| 500 kB 6.1 MB/s 
[?25hCollecting addict
  Downloading addict-2.4.0-py3-none-any.whl (3.8 kB)
Collecting yapf
  Downloading yapf-0.32.0-py2.py3-none-any.whl (190 kB)
[K     |████████████████████████████████| 190 kB 16.0 MB/s 
Building wheels for collected packages: mmcv
  Building wheel for mmcv (setup.py) ... [?25l[?25hdone
  Created wheel for mmcv: filename=mmcv-1.4.7-py2.py3-none-any.whl size=765992 sha256=b54fb42f670dacb26307193a1109203e7a555350b0e8373382831b05857187a2
  Stored in directory: /root/.cache/pip/wheels/0a/09/7c/2fda88b3e03791b04a99e5cf77516f49167f81da7aa7866535
Successfully built mmcv
Installing collected packages: yapf, addict, mm



# Import Dependencies

In [2]:
from IPython.display import Javascript, Image
from IPython import display as dis
from google.colab.output import eval_js
from base64 import b64decode, b64encode
import numpy as np
import io
import html
import time
from IPython.core.display import Video
from facenet_pytorch import MTCNN
import torch
import mmcv, cv2
import PIL
from PIL import Image, ImageDraw
import matplotlib.pyplot as plt
from google.colab import files
from google.colab import drive
import os
# Mount Google Drive at /content/drive; the training-image output directory used in Part A lives under this mount point.
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
class FastMTCNN(object):
    """Fast MTCNN implementation.

    Runs the wrapped MTCNN detector on every `stride`-th frame only and reuses
    the boxes found there to crop the frames in between.
    """

    def __init__(self, stride, resize=1, *args, **kwargs):
        """Constructor for FastMTCNN class.

        Arguments:
            stride (int): The detection stride. Faces will be detected every `stride` frames
                and remembered for `stride-1` frames.

        Keyword arguments:
            resize (float): Fractional frame scaling. [default: {1}]
            *args: Arguments to pass to the MTCNN constructor. See help(MTCNN).
            **kwargs: Keyword arguments to pass to the MTCNN constructor. See help(MTCNN).
        """
        self.stride = stride
        self.resize = resize
        self.mtcnn = MTCNN(*args, **kwargs)

    def __call__(self, frames):
        """Detect faces in frames using strided MTCNN.

        Arguments:
            frames: Sequence of frames. It must support `len()`, `[::stride]`
                slicing and iteration, and each frame must support
                `frame[y1:y2, x1:x2]` indexing. When `resize != 1` every frame
                is first passed through `cv2.resize`.

        Returns:
            list: One crop per (frame, box) pair, in frame order. Frames whose
                detection slot holds `None` contribute nothing. An empty
                `frames` yields an empty list.
        """
        # Nothing to detect; avoid handing an empty batch to the detector.
        if len(frames) == 0:
            return []

        if self.resize != 1:
            frames = [
                cv2.resize(f, (int(f.shape[1] * self.resize), int(f.shape[0] * self.resize)))
                for f in frames
            ]

        boxes, probs = self.mtcnn.detect(frames[::self.stride])

        faces = []
        for i, frame in enumerate(frames):
            # Frames between two detections reuse the boxes of the last detected frame.
            frame_boxes = boxes[i // self.stride]
            if frame_boxes is None:
                continue
            for box in frame_boxes:
                # A box may extend past the frame edge. A negative coordinate would be
                # interpreted by slicing as "counted from the end" and produce an empty
                # or wrong crop, so clamp every coordinate at 0.
                x1, y1, x2, y2 = (max(0, int(b)) for b in box[:4])
                faces.append(frame[y1:y2, x1:x2])

        return faces

In [4]:
# Use the first CUDA GPU when one is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda:0')
else:
    device = torch.device('cpu')
print('Running on device: {}'.format(device))

# Detector currently used for face detection.
mtcnn = MTCNN(device=device, min_face_size=224, keep_all=True)
# Strided wrapper kept for a future performance improvement.
mtcnn_fast = FastMTCNN(stride=4, device=device, min_face_size=224, keep_all=True)

Running on device: cuda:0


# Part A: Extracting training data from input videos.

In [5]:
uploaded_video = files.upload() # Upload the training video to the Colab runtime; click "Cancel" if there is no file to upload
user_name = "XiaoMing"          # Name of the user in the uploaded video; used as the filename prefix of the saved face images

Saving Zijian_train.mov to Zijian_train.mov


## Frame extraction: crop the user's face from each video frame and save it to Google Drive

In [7]:
video = mmcv.VideoReader('Zijian_train.mov') #Select training video here
frames = [Image.fromarray(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)) for frame in video] #Extracting frames from video
path = '/content/drive/MyDrive/Colab_Notebooks/MIE1517/output'               #Put in the output directory you want
os.makedirs(path, exist_ok=True)   # Unlike os.mkdir, this survives re-running the cell and creates missing parent folders
os.chdir(path)                     # Images below are written with relative names, i.e. into `path`
for i, frame in enumerate(frames):
    print('\rTracking frame: {}'.format(i + 1), end='')
    img = np.array(frame)
    boxes, _ = mtcnn.detect(frame)                                            #Return coordinates of faces from mtcnn model

    if boxes is None:                                                         #No face found in this frame
        continue

    for (x1, y1, x2, y2) in boxes:                                            #Each box is (left, top, right, bottom)
        #Size filtering: keep only faces that are at least 224 x 224 pixels
        if int(x2 - x1) <= 223 or int(y2 - y1) <= 223:
            continue

        x_mid = int(x1 + (x2 - x1) / 2)
        y_mid = int(y1 + (y2 - y1) / 2)
        #Fixed 310 x 310 window around the box centre, shifted upwards.
        #Clamp at 0: a negative index would be read as "from the end" by
        #slicing and yield an empty crop that crashes the resize below.
        left = max(0, x_mid - 155)
        right = max(0, x_mid + 155)
        top = max(0, y_mid - 260)
        bottom = max(0, y_mid + 50)
        selected_img = img[top:bottom, left:right, :]                         #Adjustment for cropping the correct face images.
        if selected_img.size == 0:                                            #Window fell completely outside the frame
            continue

        face_img = Image.fromarray(selected_img).resize((224,224))            #Resizing the image to desired input size.

        gray_img = cv2.cvtColor(np.array(face_img), cv2.COLOR_RGB2GRAY)       #Converting to Gray Scale images for training
        #NOTE(review): the filename depends only on the frame index, so a second
        #qualifying face in the same frame overwrites the first one.
        cv2.imwrite(user_name + str(i) + '.png', gray_img)

print('\nDone')

Tracking frame: 177
Done


## The ID identification model and its training method can be placed below.

In [None]:
# HI~

# Part B: Real time face detection & ID identification 

## Helper method for creating video streaming

In [9]:
# function to convert the JavaScript object into an OpenCV image
def js_to_image(js_reply):
  """Turn a base64 data-URL string received from JavaScript into an OpenCV image.

  js_reply: string of the form '<header>,<base64 payload>'; the segment after
            the first comma is decoded.
  Returns the array produced by cv2.imdecode in colour (BGR) mode.
  """
  encoded_payload = js_reply.split(',')[1]
  raw_bytes = b64decode(encoded_payload)
  byte_array = np.frombuffer(raw_bytes, dtype=np.uint8)
  return cv2.imdecode(byte_array, flags=cv2.IMREAD_COLOR)

# function to convert OpenCV Rectangle bounding box image into base64 byte string to be overlayed on video stream
def bbox_to_bytes(ipt):
  """Encode an RGBA pixel array as a base64 PNG data URL for the video overlay.

  ipt: Numpy array (pixels) containing the rectangle to overlay on the video stream.
  Returns a string starting with 'data:image/png;base64,'.
  """
  overlay = PIL.Image.fromarray(ipt, 'RGBA')
  with io.BytesIO() as png_buffer:
    overlay.save(png_buffer, format='png')
    encoded_png = b64encode(png_buffer.getvalue()).decode('utf-8')
  return 'data:image/png;base64,{}'.format(encoded_png)
  
# JavaScript to properly create our live video stream using our webcam as input
def video_stream():
  """Display the JavaScript that drives the live webcam stream in the notebook output.

  The script defines `removeDom`, `onAnimationFrame`, `createDom` and
  `stream_frame(label, imgData)`. As written in the script below, `stream_frame`:
    * builds the page elements and requests the webcam through
      `navigator.mediaDevices.getUserMedia` on its first call (`createDom`),
    * writes `label` into the status element and shows `imgData` as an image
      positioned over the video when they are non-empty,
    * waits for the next animation frame, which draws the video onto a
      720 x 720 canvas and resolves with a JPEG data URL (quality 0.8),
    * returns an object with timing fields and the captured frame under 'img'.
  Clicking the video, the overlay image or the instruction text sets the
  `shutdown` flag; a later `stream_frame` call that sees the flag removes the
  elements, stops the video track and returns ''.

  This function only displays the script; it returns nothing.
  """
  js = Javascript('''
    var video;
    var div = null;
    var stream;
    var captureCanvas;
    var imgElement;
    var labelElement;   
    var pendingResolve = null;
    var shutdown = false;
    
    function removeDom() {
       stream.getVideoTracks()[0].stop();
       video.remove();
       div.remove();
       video = null;
       div = null;
       stream = null;
       imgElement = null;
       captureCanvas = null;
       labelElement = null;
    }
    
    function onAnimationFrame() {
      if (!shutdown) {
        window.requestAnimationFrame(onAnimationFrame);
      }
      if (pendingResolve) {
        var result = "";
        if (!shutdown) {
          captureCanvas.getContext('2d').drawImage(video, 0, 0, 720, 720);
          result = captureCanvas.toDataURL('image/jpeg', 0.8)
        }
        var lp = pendingResolve;
        pendingResolve = null;
        lp(result);
      }
    }
    
    async function createDom() {
      if (div !== null) {
        return stream;
      }

      div = document.createElement('div');
      div.style.border = '2px solid black';
      div.style.padding = '3px';
      div.style.width = '100%';
      div.style.maxWidth = '600px';
      document.body.appendChild(div);
      
      const modelOut = document.createElement('div');
      modelOut.innerHTML = "<span>Status:</span>";
      labelElement = document.createElement('span');
      labelElement.innerText = 'No data';
      labelElement.style.fontWeight = 'bold';
      modelOut.appendChild(labelElement);
      div.appendChild(modelOut);
           
      video = document.createElement('video');
      video.style.display = 'block';
      video.width = div.clientWidth - 6;
      video.setAttribute('playsinline', '');
      video.onclick = () => { shutdown = true; };
      stream = await navigator.mediaDevices.getUserMedia(
          {video: { facingMode: "environment"}});
      div.appendChild(video);

      imgElement = document.createElement('img');
      imgElement.style.position = 'absolute';
      imgElement.style.zIndex = 1;
      imgElement.onclick = () => { shutdown = true; };
      div.appendChild(imgElement);
      
      const instruction = document.createElement('div');
      instruction.innerHTML = '<span style="color: red; font-weight: bold;">' + 'When finished, click here or on the video to stop this demo</span>';
      div.appendChild(instruction);
      instruction.onclick = () => { shutdown = true; };
      
      video.srcObject = stream;
      await video.play();

      captureCanvas = document.createElement('canvas');
      captureCanvas.width = 720; //video width; 1280
      captureCanvas.height = 720; //video height; 720
      window.requestAnimationFrame(onAnimationFrame);
      
      return stream;
    }
    async function stream_frame(label, imgData) {
      if (shutdown) {
        removeDom();
        shutdown = false;
        return '';
      }

      var preCreate = Date.now();
      stream = await createDom();
      
      var preShow = Date.now();
      if (label != "") {
        labelElement.innerHTML = label;
      }
            
      if (imgData != "") {
        var videoRect = video.getClientRects()[0];
        imgElement.style.top = videoRect.top + "px";
        imgElement.style.left = videoRect.left + "px";
        imgElement.style.width = videoRect.width + "px";
        imgElement.style.height = videoRect.height + "px";
        imgElement.src = imgData;
      }
      
      var preCapture = Date.now();
      var result = await new Promise(function(resolve, reject) {
        pendingResolve = resolve;
      });
      shutdown = false;
      
      return {'create': preShow - preCreate, 
              'show': preCapture - preShow, 
              'capture': Date.now() - preCapture,
              'img': result};
    }
    ''')

  # Displaying the Javascript object makes the functions above available to later eval_js calls.
  display(js)
  
def video_frame(label, bbox):
  """Send the status label and overlay image to the browser and fetch the next frame.

  label: text (may contain HTML) shown in the status element; '' leaves it unchanged.
  bbox:  data-URL string of the overlay image; '' leaves the overlay unchanged.
  Returns whatever `stream_frame` resolves to, as passed back by `eval_js`.
  """
  import json  # local import keeps this helper self-contained

  # json.dumps yields a correctly quoted and escaped JavaScript string literal,
  # so quotes, backslashes or newlines in `label`/`bbox` can no longer break
  # (or inject code into) the evaluated expression.
  js_call = 'stream_frame({}, {})'.format(json.dumps(str(label)), json.dumps(str(bbox)))
  data = eval_js(js_call)
  return data

## Face detection model implementations

### Re-creating the face detection models, in case they were changed unexpectedly in Part A

In [10]:
# Use the first CUDA GPU when one is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda:0')
else:
    device = torch.device('cpu')
print('Running on device: {}'.format(device))

# Detector currently used for face detection.
mtcnn = MTCNN(device=device, min_face_size=224, keep_all=True)
# Strided wrapper, a faster model.
mtcnn_fast = FastMTCNN(stride=4, device=device, min_face_size=224, keep_all=True)

Running on device: cuda:0


### Turn on the webcam and start real-time ID recognition

In [11]:
from torch.functional import Tensor
video_stream()
label_html = 'Capturing Video...'
bbox = ''
while True:
    js_reply = video_frame(label_html, bbox)
    if not js_reply:                                              #stream_frame returns '' after the user clicked to stop
        break
    if not js_reply["img"]:                                       #A stop click during a pending capture resolves with an empty image;
        break                                                     #decoding it would raise IndexError in js_to_image

    img = js_to_image(js_reply["img"])                            #Calling helper method to convert JS response to OpenCV Image

    rgb_img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
    #frame_fast = torch.from_numpy(rgb_img)
    #frame_fast = torch.stack([frame_fast], dim=0)                #Prepared input frame for mtcnn_fast face detection model
    frame = Image.fromarray(rgb_img)                              #Prepared input frame for mtcnn face detection model

    boxes, _ = mtcnn.detect(frame)                                #Return coordinates of faces from mtcnn model
    #faces_fast = mtcnn_fast(frame_fast)                          #Higher throughput model, return captured face images, not coordinates.

    bbox_array = np.zeros([720,720,4], dtype=np.uint8)            #Creating a transparent overlay for drawing bounding box

    if boxes is not None:
        for (x1, y1, x2, y2) in boxes:                            #Each box is (left, top, right, bottom)
            #Size filtering, detected faces need to be clear enough (224 x 224)
            if int(x2 - x1) <= 223 or int(y2 - y1) <= 223:
                continue

            x_mid = int(x1 + (x2 - x1) / 2)                       #Finding the middle point along the x-axis
            y_mid = int(y1 + (y2 - y1) / 2)                       #Finding the middle point along the y-axis
            #Clamp at 0: a negative index would be read as "from the end" by
            #slicing and yield an empty crop that crashes the resize below.
            left = max(0, x_mid - 140)
            right = max(0, x_mid + 140)
            top = max(0, y_mid - 260)
            bottom = max(0, y_mid + 20)

            #Crop from the RGB frame (not the BGR `img`) so that COLOR_RGB2GRAY
            #below gives the same grayscale as the Part A training images.
            selected_img = rgb_img[top:bottom, left:right, :]     #Adjustment for cropping the correct face images.
            if selected_img.size == 0:                            #Window fell completely outside the frame
                continue

            face_img = Image.fromarray(selected_img).resize((224,224))

            gray_img = cv2.cvtColor(np.array(face_img), cv2.COLOR_RGB2GRAY)

            input_for_face_recognition = torch.from_numpy(gray_img)
            input_for_face_recognition = torch.stack([input_for_face_recognition], dim=0) #Prepared input in tensor format, Ready for our ID detection model

            # Facial identification process will be placed here

            #cv2.rectangle expects integer pixel coordinates; the detector returns floats.
            top_left = (int(x1), int(y1))
            bottom_right = (int(x2), int(y2 - 20))                #Adjusting the size of the bounding box to indicate detected faces.
            bbox_array = cv2.rectangle(bbox_array, top_left, bottom_right, (0,255,0), 2) #Bounding box size and colour

    #Refresh the overlay on every frame, also when nothing was detected, so a
    #box from an earlier frame does not stay on screen after the face is gone.
    bbox_array[:,:,3] = (bbox_array.max(axis = 2) > 0 ).astype(int) * 255
    bbox = bbox_to_bytes(bbox_array)                              # converting overlay of bbox into bytes, used for the next frame

<IPython.core.display.Javascript object>

In [12]:
%%shell
# Export this notebook to HTML.
# NOTE(review): in the recorded run nbconvert only printed its usage text and the
# cell ended with CalledProcessError. Presumably Project_Face_detection.ipynb was
# not found in the working directory (an earlier cell calls os.chdir) — confirm
# the notebook's location and pass its full path if so.
jupyter nbconvert --to html Project_Face_detection.ipynb

This application is used to convert notebook files (*.ipynb)
        to various other formats.


Options
The options below are convenience aliases to configurable class-options,
as listed in the "Equivalent to" description-line of the aliases.
To see all configurable class-options for some <cmd>, use:
    <cmd> --help-all

--debug
    set log level to logging.DEBUG (maximize logging output)
    Equivalent to: [--Application.log_level=10]
--show-config
    Show the application's configuration (human-readable format)
    Equivalent to: [--Application.show_config=True]
--show-config-json
    Show the application's configuration (json format)
    Equivalent to: [--Application.show_config_json=True]
--generate-config
    generate default config file
    Equivalent to: [--JupyterApp.generate_config=True]
-y
    Answer yes to any questions instead of prompting.
    Equivalent to: [--JupyterApp.answer_yes=True]
--execute
    Execute the notebook prior to export.
    Equivalent to: [--ExecutePr

CalledProcessError: ignored