In [1]:
import multiprocessing
num_cpu = multiprocessing.cpu_count()
num_cpu

8

In [2]:
# For Dataset
import os
import glob
import json
import time
import torch
import cv2
from PIL import Image
import numpy as np
import pandas as pd
from tqdm.notebook import tqdm
from facenet_pytorch import MTCNN, InceptionResnetV1

# For Model
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras.layers import ZeroPadding2D, Convolution2D, MaxPooling2D, Dropout, Flatten, Activation, Dense
from tensorflow.keras.optimizers import SGD

# Resizing for tf shape
import einops

# Save result for later use
from numpy import asarray
from numpy import save

#from keras.models import load_model


In [3]:
# Source: https://www.kaggle.com/timesler/facial-recognition-model-in-pytorch
class DetectionPipeline:
    """Pipeline class for detecting faces in the frames of a video file."""
    
    def __init__(self, detector, n_frames=None, batch_size=128, resize=None):
        """Constructor for DetectionPipeline class.
        
        Keyword Arguments:
            n_frames {int} -- Total number of frames to load. These will be evenly spaced
                throughout the video. If not specified (i.e., None), all frames will be loaded.
                (default: {None})
            batch_size {int} -- Batch size to use with MTCNN face detector. (default: {32})
            resize {float} -- Fraction by which to resize frames from original prior to face
                detection. A value less than 1 results in downsampling and a value greater than
                1 result in upsampling. (default: {None})
        """
        self.detector = detector
        self.n_frames = n_frames
        self.batch_size = batch_size
        self.resize = resize
    
    def __call__(self, filename):
        """Load frames from an MP4 video and detect faces.

        Arguments:
            filename {str} -- Path to video.
        """
        # Create video reader and find length
        v_cap = cv2.VideoCapture(filename)
        v_len = int(v_cap.get(cv2.CAP_PROP_FRAME_COUNT))

        # Pick 'n_frames' evenly spaced frames to sample
        if self.n_frames is None:
            sample = np.arange(0, v_len)
        else:
            sample = np.linspace(0, v_len - 1, self.n_frames).astype(int)

        # Loop through frames
        faces = []
        frames = []
        for j in range(v_len):
            success = v_cap.grab()
            if j in sample:
                # Load frame
                success, frame = v_cap.retrieve()
                if not success:
                    continue
                frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
                frame = Image.fromarray(frame)
                
                # Resize frame to desired size
                if self.resize is not None:
                    frame = frame.resize([int(d * self.resize) for d in frame.size])
                frames.append(frame)

                # When batch is full, detect faces and reset frame list
                if len(frames) % self.batch_size == 0 or j == sample[-1]:
                    faces.extend(self.detector(frames))
                    frames = []

        v_cap.release()

        return faces

In [4]:
# Source: https://www.kaggle.com/timesler/facial-recognition-model-in-pytorch
def process_faces(faces, feature_extractor):
    
    # Filter out frames without faces
    faces = [f for f in faces if f is not None]
    if len(faces) == 0 or faces == None:
        return None
    
    embeddings = []
    for i, face in zip(range(len(faces)), faces):
        # Return a handful of faces
        if i==25:
            break
        
        # Transforming image to what TF wants
        face = einops.rearrange(face,'b c w h -> b w h c')
        #face = einops.rearrange(face, 'b c w h -> b (c h w)')
        
        # Extract the features
        feature = feature_extractor(tf.cast(face, tf.float32))
        embeddings.append(feature)
 
    # Calculate centroid for video and distance of each face's feature vector from centroid
    centroid = [sum(embed)/len(embeddings) for embed in zip(*embeddings)]
    
    distances = []
    for embed in embeddings:
        distance = [a-b for a,b in zip(embed, centroid)]
        
        # Normalize distance
        y = tf.linalg.normalize(distance[3])
        y = tf.convert_to_tensor(y[0])
        #print(6)
        # Scale values of tensor to be between 0 and 1
        y = tf.truediv(
            tf.subtract(
                y,
                tf.reduce_min(y)
            ),
            tf.subtract(
                tf.reduce_max(y),
                tf.reduce_min(y)
            )
        )
        
        # Global average pooling to reduce to face shape for training
        #dist = einops.reduce(y,'b w h c -> h w', 'mean')
        
        # Convert back to what Keras wants
        #dist = einops.rearrange(y,'b w h c -> b c w h')
        #for x in dist:
        #    distances.append(x)
        #distances.append(dist)
        
        for x in y:
            distances.append(x)
        
    #distances = np.asarray(distances).astype('float32')
    return distances

In [5]:
# Model for original frames
# From the paper https://www.robots.ox.ac.uk/~vgg/publications/2015/Parkhi15/parkhi15.pdf

model_frames = keras.Sequential()
model_frames.add(ZeroPadding2D((1,1),input_shape=(160,160, 3)))
model_frames.add(Convolution2D(64, (3, 3), activation='relu'))
#model_frames.add(ZeroPadding2D((1,1)))
#model_frames.add(Convolution2D(64, (3, 3), activation='relu'))
model_frames.add(MaxPooling2D((2,2), strides=(2,2)))
 
model_frames.add(ZeroPadding2D((1,1)))
model_frames.add(Convolution2D(128, (3, 3), activation='relu'))
#model_frames.add(ZeroPadding2D((1,1)))
#model_frames.add(Convolution2D(128, (3, 3), activation='relu'))
model_frames.add(MaxPooling2D((2,2), strides=(2,2)))
 
model_frames.add(ZeroPadding2D((1,1)))
model_frames.add(Convolution2D(256, (3, 3), activation='relu'))
#model_frames.add(ZeroPadding2D((1,1)))
#model_frames.add(Convolution2D(256, (3, 3), activation='relu'))
#model_frames.add(ZeroPadding2D((1,1)))
#model_frames.add(Convolution2D(256, (3, 3), activation='relu'))
model_frames.add(MaxPooling2D((2,2), strides=(2,2)))
 
model_frames.add(ZeroPadding2D((1,1)))
model_frames.add(Convolution2D(512, (3, 3), activation='relu'))
#model_frames.add(ZeroPadding2D((1,1)))
#model_frames.add(Convolution2D(512, (3, 3), activation='relu'))
#model_frames.add(ZeroPadding2D((1,1)))
#model_frames.add(Convolution2D(512, (3, 3), activation='relu'))
model_frames.add(MaxPooling2D((2,2), strides=(2,2)))
 
model_frames.add(ZeroPadding2D((1,1)))
model_frames.add(Convolution2D(512, (3, 3), activation='relu'))
#model_frames.add(ZeroPadding2D((1,1)))
#model_frames.add(Convolution2D(512, (3, 3), activation='relu'))
#model_frames.add(ZeroPadding2D((1,1)))
#model_frames.add(Convolution2D(512, (3, 3), activation='relu'))
model_frames.add(MaxPooling2D((2,2), strides=(2,2)))

model_frames.add(Convolution2D(4096, (5, 5), activation='relu'))
model_frames.add(Dropout(0.5))
model_frames.add(Convolution2D(4096, (1, 1), activation='relu'))
model_frames.add(Dropout(0.5))
model_frames.add(Convolution2D(2622, (1, 1)))
model_frames.add(Flatten())
model_frames.add(Activation('softmax'))

model_frames.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
zero_padding2d (ZeroPadding2 (None, 162, 162, 3)       0         
_________________________________________________________________
conv2d (Conv2D)              (None, 160, 160, 64)      1792      
_________________________________________________________________
max_pooling2d (MaxPooling2D) (None, 80, 80, 64)        0         
_________________________________________________________________
zero_padding2d_1 (ZeroPaddin (None, 82, 82, 64)        0         
_________________________________________________________________
conv2d_1 (Conv2D)            (None, 80, 80, 128)       73856     
_________________________________________________________________
max_pooling2d_1 (MaxPooling2 (None, 40, 40, 128)       0         
_________________________________________________________________
zero_padding2d_2 (ZeroPaddin (None, 42, 42, 128)       0

In [6]:
# model_faces for detected faces

model_faces = keras.Sequential()
model_faces.add(ZeroPadding2D((1,1),input_shape=(82,82,64)))
model_faces.add(Convolution2D(64, (64, 64), activation='relu'))
model_faces.add(ZeroPadding2D((1,1)))
model_faces.add(Convolution2D(64, (3, 3), activation='relu'))
model_faces.add(MaxPooling2D((2,2), strides=(2,2)))

model_faces.add(ZeroPadding2D((1,1)))
model_faces.add(Convolution2D(128, (3, 3), activation='relu'))
model_faces.add(ZeroPadding2D((1,1)))
model_faces.add(Convolution2D(128, (3, 3), activation='relu'))
model_faces.add(MaxPooling2D((2,2), strides=(2,2)))

model_faces.add(Convolution2D(1024, (5, 5), activation='relu'))
model_faces.add(Dropout(0.5))
model_faces.add(Convolution2D(1024, (1, 1), activation='relu'))
model_faces.add(Dropout(0.5))
model_faces.add(Convolution2D(2622, (1, 1)))
model_faces.add(Flatten())
model_faces.add(Dense(1, activation='sigmoid'))

model_faces.summary()

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
zero_padding2d_5 (ZeroPaddin (None, 84, 84, 64)        0         
_________________________________________________________________
conv2d_8 (Conv2D)            (None, 21, 21, 64)        16777280  
_________________________________________________________________
zero_padding2d_6 (ZeroPaddin (None, 23, 23, 64)        0         
_________________________________________________________________
conv2d_9 (Conv2D)            (None, 21, 21, 64)        36928     
_________________________________________________________________
max_pooling2d_5 (MaxPooling2 (None, 10, 10, 64)        0         
_________________________________________________________________
zero_padding2d_7 (ZeroPaddin (None, 12, 12, 64)        0         
_________________________________________________________________
conv2d_10 (Conv2D)           (None, 10, 10, 128)      

In [7]:
feature_extractor = keras.Model(
    inputs=model_frames.inputs,
    outputs=[layer.output for layer in model_frames.layers],
)

In [8]:
# Load face detector
face_detector = MTCNN(margin=14, keep_all=True, factor=0.).eval()

# Load facial recognition model
# feature_extractor = InceptionResnetV1(pretrained='vggface2').eval()

# Define face detection pipeline
detection_pipeline = DetectionPipeline(detector=face_detector, n_frames=None, batch_size=128, resize=0.25)

In [9]:
with open('../resources/dataset/metadata.json') as f:
    data = json.load(f)

filenames = glob.glob('../resources/dataset/train_sample_videos/*.mp4')

counter=0
X = []
y = []
start = time.time()
n_processed = 0
with torch.no_grad():
    for i, filename in tqdm(enumerate(filenames), total=len(filenames)):
        #print(i, filename)

        try:
            # Load frames and find faces
            faces = detection_pipeline(filename)

            # Calculate distances of feature vectors
            #X.append(process_faces(faces, feature_extractor))

            features = process_faces(faces, feature_extractor)

            #if (len(z)!=0):
            if features != None:
                for z in features:
                    assert not np.any(np.isnan(z))
                    #print(f"z: {z}")
                    X.append(z)
                    #print(f"X: {X}")
                    if(data[filename[41:]]['label']=='FAKE'):
                        y.append(1)
                    else:
                        y.append(0)

        except KeyboardInterrupt:
            print('\nStopped.')
            break
        except Exception as e:
            print(e)
            #X.append(None)
            #y.append(None)

        n_processed += len(faces)
        print(f'Frames per second (load+detect+embed): {n_processed / (time.time() - start):6.3}\r', end='')

        # Converting back and forth because does not have enough memory to convert huge array at one time
        # Convert to array for split
        #X = np.asarray(X).astype('float32')
        #y = np.asarray(y)

        # Convert it back if there are new features to append
        #X = X.tolist()
        #y = y.tolist()

        if i==200:
            # Convert to array for split
            X = np.asarray(X).astype('float32')
            y = np.asarray(y)

            # Create a file if it is not exist
            f = open('X-200', 'w')
            f.close

            f = open('Y-200', 'w')
            f.close
            
            # Save result to a file
            file = open('X-200', 'wb')
            np.save(file, X)
            file.close

            file2 = open('Y-200', 'wb')
            np.save(file2, y)
            file2.close            
        
            break

  0%|          | 0/400 [00:00<?, ?it/s]

  batch_boxes, batch_points = np.array(batch_boxes), np.array(batch_points)
  boxes = np.array(boxes)
  points = np.array(points)


Frames per second (load+detect+embed):   28.7

  probs = np.array(probs)


Frames per second (load+detect+embed):   27.5
Frames per second (load+detect+embed):   26.1
Frames per second (load+detect+embed):   26.3
Incompatible shapes: [2,162,162,3] vs. [3,162,162,3] [Op:AddV2]
Frames per second (load+detect+embed):   26.2
Frames per second (load+detect+embed):   26.3
Frames per second (load+detect+embed):   26.3
Incompatible shapes: [3,162,162,3] vs. [2,162,162,3] [Op:AddV2]
Frames per second (load+detect+embed):   26.2

In [19]:
result = open('X-200', 'rb')
X = np.load(result, allow_pickle=True)

result2 = open('Y-200', 'rb')
y = np.load(result2, allow_pickle=True)

In [21]:
print(X.shape, X.dtype)
print(y.shape, y.dtype)

(554, 82, 82, 64) float32
(17,) int64


In [18]:
# 80:20 for split size
size = int(len(X) * 0.8)

x_train, x_test = X[:size], X[size:]
y_train, y_test = y[:size], y[size:]

In [None]:
# To clear memory
import gc
del model_frames
del X
gc.collect()

In [None]:
model_faces.compile(
    optimizer=SGD(learning_rate=0.01, momentum=0.9),
    loss=tf.keras.losses.BinaryCrossentropy(),
    metrics=['accuracy']
)

In [None]:
history = model_faces.fit(
    x_train,
    y_train,
    batch_size=128,
    epochs=100,
    use_multiprocessing=True,
    workers=num_cpu
)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100

In [None]:
# Export model into a file
model_faces.save('model.h5')
#del model