In [15]:
import av
import os
import face_recognition
from PIL import Image

import numpy as np
import shutil
import math
import random

import tensorflow as tf
import tensorflow_model_optimization as tfmot

import keras
from keras import layers
from keras.layers import Conv2D, MaxPool2D, Flatten, Dense, BatchNormalization, Dropout, LeakyReLU
from keras.optimizers import Adam
from keras.utils import img_to_array

In [2]:
# Top-level working directories. Every path ends with '/' because the rest of
# the notebook builds file paths by plain string concatenation.
INPUT_DIR = 'input_files/'
INTERMEDIARY_DIR = 'intermediary_files/'
OUTPUT_DIR = 'output_files/'

# All top-level directories, created by the setup cell below.
TOP_DIRS = [INPUT_DIR, INTERMEDIARY_DIR, OUTPUT_DIR]

# Sub-directories of INTERMEDIARY_DIR, one per pre-processing product
# (key-frame videos, face-cropped videos, residual videos).
IFRAME_DIR = 'iframes/'
FACECROP_DIR = 'faces/'
RESIDUAL_DIR = 'residual/'

PREPROCESS_DIRS = [IFRAME_DIR, FACECROP_DIR, RESIDUAL_DIR]

# Saved-model locations, one per stream. Only TEMPORAL_PATH is read in the
# visible part of the notebook; the other two are presumably used by the
# MesoNet and SRM streams once their weights are loaded -- TODO confirm.
MESONET_PATH  = 'saved_models/mesonet/'
SRM_PATH      = 'saved_models/srm/'
TEMPORAL_PATH = 'saved_models/temporal/'

In [47]:
# Create the working directories. `exist_ok=True` keeps the cell safe to re-run
# when the directories already exist, while genuine failures (e.g. missing
# permissions) are raised instead of being silently swallowed.
for top_dir in TOP_DIRS:
    os.makedirs(top_dir, exist_ok=True)

# Pre-processing outputs live in sub-directories of INTERMEDIARY_DIR.
for preprocess_dir in PREPROCESS_DIRS:
    os.makedirs(os.path.join(INTERMEDIARY_DIR, preprocess_dir), exist_ok=True)

# Flow

- Read all videos present in input_files folder
- For each video in the input directory
    - Extract I-Frames and crop faces
    - Extract residuals
    - Save face-cropped video and residuals video
    - In Frame-level stream
        - Extract all frames in face-cropped video
        - Take average of prediction results as video score
    - In SRM stream
        - Extract snippets from face-cropped video
        - Take average of prediction results as video score
    - In Temporal stream
        - Extract all residuals from residual video
        - Take average of prediction per segment
        - Select the most extreme value as video score (closest to 0 or 1)
    - In score aggregation
        - Take average of three scores
        - Use voting to determine class (Use extreme value of major class as video score)
        - Use trained svm model to predict class probabilities

# Functions

## Pre-Processing

In [34]:
def extract_iframes(fp):
    """Copy the key-frame (I-frame) packets of a video into a new video file.

    Packets of the first video stream are remuxed as-is (no re-encoding) into
    a file with the same name inside INTERMEDIARY_DIR + IFRAME_DIR.

    Args:
        fp: Path of the source video.
    """
    out_path = INTERMEDIARY_DIR + IFRAME_DIR + os.path.split(fp)[1]

    # Context managers close both containers even when demuxing or muxing
    # raises part-way through, so no file handle is leaked.
    with av.open(fp) as input_vid, av.open(out_path, 'w') as output_vid:
        in_stream = input_vid.streams.video[0]
        # NOTE(review): skip_frame is a decoder setting and nothing is decoded
        # here, so this line presumably has no effect -- kept for parity.
        in_stream.codec_context.skip_frame = "NONKEY"

        out_stream = output_vid.add_stream(template=in_stream)

        for packet in input_vid.demux(in_stream):
            # Packets without a dts (the flush packets demux emits at the end)
            # cannot be muxed and are skipped.
            if packet.dts is None:
                continue

            if packet.is_keyframe:
                packet.stream = out_stream
                output_vid.mux(packet)

In [35]:
# MesoNet works best with images having 256x256 dimension
# If face location borders span a smaller distance, extend the borders
# on either side equally to ensure 256x256 image

def normalize_face_borders(low, high, max_val, req_dim):
    """Widen the interval [low, high] symmetrically towards ``req_dim`` pixels.

    Args:
        low: Lower border of the face (left or top coordinate).
        high: Upper border of the face (right or bottom coordinate).
        max_val: Largest valid coordinate along this axis (image width/height).
        req_dim: Span the returned interval should cover.

    Returns:
        A ``(low, high)`` tuple. When the face already spans at least
        ``req_dim`` the borders are returned unchanged. Otherwise each border
        is moved outwards by half of the missing span and clamped to
        ``[0, max_val]``, so the result can still be narrower than ``req_dim``
        for faces close to the image edge.
    """
    diff = high - low

    # Always return a pair: callers unpack the result, so returning None for
    # large faces would raise a TypeError. The threshold is the requested
    # span rather than a hard-coded 256.
    if diff >= req_dim:
        return low, high

    offset = (req_dim - diff) / 2
    low = max(0, low - offset)
    high = min(max_val, high + offset)

    return low, high

In [36]:
# Face Location: (left, top, right, bottom)
def modify_crop_window(face_location, height, width, req_dim):
    """Resize a face box towards ``req_dim`` pixels along both image axes.

    Args:
        face_location: Box as ``(left, top, right, bottom)``.
        height: Image height, the upper limit for the vertical borders.
        width: Image width, the upper limit for the horizontal borders.
        req_dim: Requested side length of the crop window.

    Returns:
        The adjusted box as ``(left, top, right, bottom)``.
    """
    x_min, y_min = face_location[0], face_location[1]
    x_max, y_max = face_location[2], face_location[3]

    # Horizontal borders are limited by the width, vertical ones by the height.
    new_left, new_right = normalize_face_borders(x_min, x_max, width, req_dim)
    new_top, new_bottom = normalize_face_borders(y_min, y_max, height, req_dim)

    return (new_left, new_top, new_right, new_bottom)

In [37]:
def save_cropped_faces_to_video(fp, req_dim):
    """Crop the first detected face of every frame and save the crops as a video.

    The output is written to INTERMEDIARY_DIR + FACECROP_DIR under the same
    file name as the input. Frames in which no face is detected are skipped.

    Args:
        fp: Path of the source video.
        req_dim: Requested side length of the crop window. The detected face
            box is only adjusted (via modify_crop_window) when this is
            positive; otherwise the raw face box is used.
    """
    out_path = INTERMEDIARY_DIR + FACECROP_DIR + os.path.split(fp)[1]

    # Context managers close both containers even if detection or encoding
    # raises; the local names also no longer shadow the builtin `input`.
    with av.open(fp) as in_container, av.open(out_path, 'w') as out_container:
        in_stream = in_container.streams.video[0]
        codec_name = in_stream.codec_context.name

        # output video dimension should be 256x256
        # NOTE(review): crops of another size are presumably rescaled by the
        # encoder to the stream dimensions -- confirm.
        out_stream = out_container.add_stream(codec_name, rate=8)
        out_stream.width = 256
        out_stream.height = 256
        out_stream.pix_fmt = in_stream.codec_context.pix_fmt

        for frame in in_container.decode(in_stream):
            img_frame = frame.to_image()

            # The detector needs an RGB array. `frame.to_ndarray()` without a
            # format returns the frame in its native pixel format (often planar
            # YUV), so the array is taken from the RGB PIL image instead.
            nd_frame = np.array(img_frame)

            # Face location returned by face_recognition api: [(top, right, bottom, left)]
            # Origin considered at top left corner of image => right margin > left margin, bottom > top
            face_locations = face_recognition.api.face_locations(nd_frame)

            # if can't find a face, then skip that frame
            # TODO : sync frame skipping with temporality stream
            if len(face_locations) == 0:
                continue

            # Face location required by PIL.Image: (left, top, right, bottom)
            top, right, bottom, left = face_locations[0]
            crop_box = (left, top, right, bottom)

            # Modify crop window size only if positive value given.
            if req_dim > 0:
                crop_box = modify_crop_window(crop_box, img_frame.height, img_frame.width, req_dim)

            face_img = img_frame.crop(crop_box)

            out_frame = av.VideoFrame.from_image(face_img)
            out_container.mux(out_stream.encode(out_frame))

        # Flush the frames still buffered inside the encoder.
        out_container.mux(out_stream.encode(None))

In [38]:
def compute_residual(a, b):
    """Return the element-wise difference ``a - b`` of two images as a PIL image.

    NOTE(review): the subtraction uses the arrays' own dtype. If both inputs
    are 8-bit images, negative differences presumably wrap around instead of
    going below zero -- confirm this is intended before changing it, since a
    trained model may depend on it.
    """
    minuend = np.asarray(a)
    subtrahend = np.asarray(b)
    difference = minuend - subtrahend

    return Image.fromarray(difference)

In [39]:
def extract_residuals(fp):
    """Write one residual frame per group of pictures (GOP) of a video.

    A GOP runs from one key frame up to the frame before the next key frame.
    For each GOP the residual is ``last frame - first frame`` (see
    compute_residual). The residuals are encoded into a video with the same
    file name inside INTERMEDIARY_DIR + RESIDUAL_DIR.

    Args:
        fp: Path of the source video.

    Raises:
        ValueError: If no video frame can be decoded from ``fp``.
    """
    out_path = INTERMEDIARY_DIR + RESIDUAL_DIR + os.path.split(fp)[1]

    # Context managers close the containers even when decoding/encoding fails.
    with av.open(fp) as input_vid:
        in_stream = input_vid.streams.video[0]
        codec_name = in_stream.codec_context.name

        # Decode the video stream only: decoding every stream would mix frames
        # of other streams (e.g. audio) into the list.
        frame_list = list(input_vid.decode(in_stream))
        if not frame_list:
            raise ValueError(f'No video frames could be decoded from {fp}')

        # Key-frame positions come from the decoded frames themselves, so they
        # are true indices into frame_list (packet order from demux() need not
        # match frame order and may contain packets of other streams).
        iframe_index = [i for i, frame in enumerate(frame_list) if frame.key_frame]

        # (first, last) frame index of every GOP; the final GOP ends at the
        # last frame of the video.
        gop_bounds = []
        gop_start_index = 0
        for index in iframe_index:
            if index == 0:
                continue

            gop_bounds.append((gop_start_index, index - 1))
            gop_start_index = index
        gop_bounds.append((gop_start_index, len(frame_list) - 1))

        with av.open(out_path, 'w') as output_vid:
            # output video dimension should be 224x224
            out_stream = output_vid.add_stream(codec_name, rate=8)
            out_stream.width = 224
            out_stream.height = 224
            out_stream.pix_fmt = in_stream.codec_context.pix_fmt

            for first, last in gop_bounds:
                residual = compute_residual(frame_list[last].to_image(),
                                            frame_list[first].to_image())
                out_frame = av.VideoFrame.from_image(residual)
                output_vid.mux(out_stream.encode(out_frame))

            # Flush the frames still buffered inside the encoder.
            output_vid.mux(out_stream.encode(None))

## Models

In [19]:
class VideoScore:
    """Score assigned to one video, together with how the score is oriented."""

    def __init__(self, filename, score, fake_on_lower_half=True):
        # filename: name of the scored video.
        # score: score assigned to that video.
        # fake_on_lower_half: presumably True when the lower half of the score
        #   range means "fake" -- TODO confirm against the aggregation code.
        self.filename, self.score = filename, score
        self.fake_on_lower_half = fake_on_lower_half

    def get_filename(self):
        """Return the name of the scored video."""
        return self.filename

    def get_score(self):
        """Return the stored score."""
        return self.score

    def get_fake_on_lower_half(self):
        """Return the stored score-orientation flag."""
        return self.fake_on_lower_half

In [20]:
# Returns the index of frames that begin a new segment (except the first segment)
def get_segment_dividers(frame_count, num_segments):
    """Return the frame indices at which segments 2..num_segments begin.

    Every segment gets ``floor(frame_count / num_segments)`` frames; the start
    of the first segment (index 0) is not part of the result.
    """
    frames_per_segment = math.floor(frame_count / num_segments)

    dividers = []
    for segment_number in range(1, num_segments):
        dividers.append(frames_per_segment * segment_number)

    return dividers

In [21]:
# Returns the indices of the frames that will be randomly selected from each segment
# Multiple snippets indices per segment can be returned by setting the num_snippets arg 
def get_snippet_indices(segment_dividers, num_snippets):
    """Pick random frame indices from each segment.

    Args:
        segment_dividers: Ascending end indices (exclusive) of the segments;
            the first segment starts at index 0 and every other segment starts
            where the previous one ended.
        num_snippets: Number of indices to draw per segment. Values <= 0 are
            treated as 1.

    Returns:
        A flat list of frame indices, ``num_snippets`` per non-empty segment.
        Indices are drawn independently, so a segment may contribute the same
        index more than once.
    """
    start_index = 0
    num_snippets = 1 if num_snippets <= 0 else num_snippets

    snippet_indices = []
    for end_index in segment_dividers:

        # An empty segment (end == start) has no frame to pick from;
        # random.randint would raise ValueError on the empty range.
        if end_index > start_index:
            # Extracting multiple snippets per segment (if needed)
            for _ in range(num_snippets):
                snippet_indices.append(random.randint(start_index, end_index - 1))

        start_index = end_index

    return snippet_indices

In [22]:
# Returns an array of randomly selected snippets(PIL.Image) from each segment of the input video
def extract_snippets(fp, num_segments, num_snippets):
    """Return randomly selected frames (PIL images) from each segment of a video.

    Args:
        fp: Path of the video.
        num_segments: Number of equally sized segments the video is split into.
        num_snippets: Number of frames to sample per segment.

    Returns:
        A list of PIL images in frame order. When the frame count reported by
        the stream is smaller than ``num_segments * num_snippets``, every frame
        of the video is returned instead.
    """
    # The context manager closes the container on every exit path.
    with av.open(fp) as vid_container:
        vid_stream = vid_container.streams.video[0]
        frame_count = vid_stream.frames

        # If number of frames in video is less than the number of frames that need to sampled
        # then take all frames in the video
        if frame_count < num_segments * num_snippets:
            # Decode the video stream only, so frames of other streams are
            # never converted to images.
            return [frame.to_image() for frame in vid_container.decode(vid_stream)]

        segment_dividers = get_segment_dividers(frame_count, num_segments)
        segment_dividers = segment_dividers + [frame_count]

        # A set gives constant-time membership tests; the largest wanted index
        # is computed once instead of on every frame.
        wanted_indices = set(get_snippet_indices(segment_dividers, num_snippets))
        last_wanted = max(wanted_indices, default=-1)

        snippets = []
        for frame_index, frame in enumerate(vid_container.decode(vid_stream)):
            # Nothing left to collect: stop decoding early.
            if frame_index > last_wanted:
                break

            if frame_index in wanted_indices:
                snippets.append(frame.to_image())

    return snippets

### Mesonet Stream

In [None]:
def create_model(input_size):
    """Build the (uncompiled) MesoNet-style binary classifier.

    Architecture: four Conv2D -> BatchNormalization -> MaxPool2D blocks,
    then Flatten -> Dropout -> Dense(16) -> LeakyReLU -> Dropout ->
    Dense(1, sigmoid).

    Args:
        input_size: Shape of one input image, e.g. ``(256, 256, 3)``.

    Returns:
        A ``keras.Sequential`` model named 'Mesonet'.
    """
    # (filters, kernel_size, pool_size) of each convolutional block.
    conv_blocks = [(8, 3, 2), (8, 5, 2), (16, 5, 4), (16, 5, 4)]

    model = keras.Sequential(name='Mesonet')
    for block_index, (filters, kernel_size, pool_size) in enumerate(conv_blocks):
        # Only the first layer declares the input shape. Later layers take
        # their shape from the previous layer, so no shape is hard-coded
        # there (fixed values would only be right for a 256x256 input).
        shape_kwargs = {'input_shape': input_size} if block_index == 0 else {}

        model.add(Conv2D(filters=filters, kernel_size=kernel_size,
                         activation='relu', padding="same", **shape_kwargs))
        model.add(BatchNormalization())
        model.add(MaxPool2D(pool_size, pool_size, padding="same"))

    model.add(Flatten())

    model.add(Dropout(0.5))
    model.add(Dense(16))
    model.add(LeakyReLU())

    model.add(Dropout(0.5))
    model.add(Dense(1, activation='sigmoid'))

    return model

In [None]:
# Build and compile the MesoNet (frame-level) stream for 256x256 RGB inputs.
input_size = (256, 256, 3)
mesonet_model = create_model(input_size)
# Binary classification set-up: binary cross-entropy loss, Adam with a learning
# rate of 1e-4, and the confusion-matrix counts tracked next to the usual
# accuracy / precision / recall / AUC metrics.
mesonet_model.compile(optimizer=Adam(learning_rate=0.0001), 
              loss=keras.losses.BinaryCrossentropy(), 
              metrics = [keras.metrics.BinaryAccuracy(), 
                         keras.metrics.Precision(), 
                         keras.metrics.Recall(),
                         keras.metrics.AUC(),
                         keras.metrics.FalseNegatives(),
                         keras.metrics.FalsePositives(),
                         keras.metrics.TrueNegatives(),
                         keras.metrics.TruePositives()])
# Print the layer-by-layer overview (output shapes and parameter counts).
mesonet_model.summary()

Model: "Mesonet"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 conv2d (Conv2D)             (None, 256, 256, 8)       224       
                                                                 
 batch_normalization (BatchN  (None, 256, 256, 8)      32        
 ormalization)                                                   
                                                                 
 max_pooling2d (MaxPooling2D  (None, 128, 128, 8)      0         
 )                                                               
                                                                 
 conv2d_1 (Conv2D)           (None, 128, 128, 8)       1608      
                                                                 
 batch_normalization_1 (Batc  (None, 128, 128, 8)      32        
 hNormalization)                                                 
                                                           

### SRM Stream

### Temporal Stream

In [23]:
# ResNet50V2 backbone without its classification head (include_top=False);
# a custom head is attached when the temporal model is assembled.
# NOTE(review): no `weights` argument is given, so the library default applies
# -- presumably pretrained weights that may be downloaded on first use; confirm.
resnet50v2 = tf.keras.applications.ResNet50V2(include_top=False)

In [24]:
# Make the whole backbone trainable, then restrict the batch-normalisation
# layers: only the first BN layer stays trainable, all later ones are frozen.
resnet50v2.trainable = True

# TODO: Freezing all BN layers except for first one (can be done better)
bn_layers = [candidate for candidate in resnet50v2.layers
             if isinstance(candidate, keras.layers.BatchNormalization)]

for position, layer in enumerate(bn_layers):
    # position 0 is the first BN layer in the backbone's layer order
    layer.trainable = (position == 0)

In [25]:
# Temporal-stream classifier: ResNet50V2 features followed by a small dense head.
inputs = keras.layers.Input((224, 224, 3))
# The ResNetV2 preprocessing is part of the graph, so the model is fed with
# frames that have only been resized (see calculate_temporal_score).
x = tf.keras.applications.resnet_v2.preprocess_input(inputs)
x = resnet50v2(x)
x = Flatten()(x)
# Dropout with rate 0.8 before and after the 100-unit hidden layer.
x = Dropout(0.8)(x)
x = Dense(100, activation=LeakyReLU())(x)
x = Dropout(0.8)(x)
# Single sigmoid unit: one value between 0 and 1 per input frame.
out = Dense(1, activation='sigmoid')(x)

temporal_model = keras.Model(inputs, out, name="temporal_stream")

In [26]:
# Compile the temporal stream for binary classification: binary cross-entropy
# loss, Adam with a learning rate of 1e-5, and the same metric set that is
# used for the MesoNet stream.
temporal_model.compile(optimizer = Adam(learning_rate = 0.00001), 
              loss = keras.losses.BinaryCrossentropy(), 
              metrics = [keras.metrics.BinaryAccuracy(), 
                         keras.metrics.Precision(), 
                         keras.metrics.Recall(),
                         keras.metrics.AUC(),
                         keras.metrics.FalseNegatives(),
                         keras.metrics.FalsePositives(),
                         keras.metrics.TrueNegatives(),
                         keras.metrics.TruePositives()],
             )

In [27]:
# Restore the trained temporal-stream weights from the checkpoint prefix
# 'checkpoint_final_sig' under TEMPORAL_PATH.
# NOTE(review): the returned load status is not checked, so an incomplete
# restore would presumably go unnoticed -- confirm.
temporal_model.load_weights(TEMPORAL_PATH + 'checkpoint_final_sig')

<tensorflow.python.checkpoint.checkpoint.CheckpointLoadStatus at 0x14dcba81e10>

In [28]:
def get_residuals(fp, num_segments):
    """Load the frames of a residual video, grouped into consecutive segments.

    Args:
        fp: Path of the residual video.
        num_segments: Number of segments to split the frames into.

    Returns:
        A list of ``num_segments`` lists of PIL images. Every segment except
        the last holds ``floor(frame_total / num_segments)`` frames; the last
        one takes the remainder. Segments can be empty for very short videos.
    """
    # The context manager closes the container; only the video stream is
    # decoded so frames of other streams never reach `to_image()`.
    with av.open(fp) as vid_container:
        vid_stream = vid_container.streams.video[0]
        frame_list = [frame.to_image() for frame in vid_container.decode(vid_stream)]

    # Segment boundaries are derived from the frames actually decoded rather
    # than from the stream's frame-count metadata, which may be missing or
    # differ from the decoded total.
    segment_dividers = get_segment_dividers(len(frame_list), num_segments)

    residuals = []
    start_index = 0
    for sd in segment_dividers:
        residuals.append(frame_list[start_index:sd])
        start_index = sd

    # The last segment takes every remaining frame.
    residuals.append(frame_list[start_index:])

    return residuals

In [29]:
def calculate_temporal_score(model, fp):
    """Score a residual video with the temporal-stream model.

    The residual frames are split into 3 segments, the model predictions are
    averaged per segment, and the most extreme segment average (the one
    closest to 0 or to 1) is returned as the video score.

    Args:
        model: Model whose ``predict`` accepts a batch of 224x224 frames.
        fp: Path of the residual video.

    Returns:
        The segment average closest to 0 or 1.

    Raises:
        ValueError: If the video contains no residual frames.
    """
    residuals = get_residuals(fp, num_segments=3)

    results = []
    for residual_set in residuals:
        # Videos with fewer residual frames than segments produce empty
        # segments; there is nothing to predict on, so they are skipped.
        if not residual_set:
            continue

        tf_frames = np.asarray([
            img_to_array(tf.image.resize(frame, size = [224, 224]))
            for frame in residual_set
        ])

        result = model.predict(tf_frames, verbose=0)
        results.append(np.average(result))

    if not results:
        raise ValueError(f'No residual frames found in {fp}')

    max_val = np.max(results)
    min_val = np.min(results)

    # Distance of the maximum to 1 versus distance of the minimum to 0.
    return max_val if 1 - max_val < min_val else min_val

# Execution

In [40]:
def process_video(fp):
    """Pre-process one video and print its temporal-stream score.

    Steps: extract the key frames, crop faces from the key-frame video,
    extract the residuals of the original video, then score the residual
    video with the global ``temporal_model``.
    """
    video_name = os.path.split(fp)[1]
    iframe_video = INTERMEDIARY_DIR + IFRAME_DIR + video_name
    residual_video = INTERMEDIARY_DIR + RESIDUAL_DIR + video_name

    extract_iframes(fp)
    # -1 => keep the detected face box as is (no crop-window adjustment)
    save_cropped_faces_to_video(iframe_video, -1)
    extract_residuals(fp)

    t_score = calculate_temporal_score(temporal_model, residual_video)
    print(t_score)

In [44]:
filename='temp_r1.mp4'
# filename=None

# If no filename was given, or the given file does not exist, process all
# videos in the input directory
if filename is None or not os.path.exists(filename):
    # Make the fallback visible: a mistyped filename would otherwise silently
    # trigger processing of the whole input directory.
    if filename is not None:
        print(f'File not found: {filename}. Processing all videos in {INPUT_DIR}')

    for video in os.listdir(INPUT_DIR):
        video_path = os.path.join(INPUT_DIR, video)

        # Skip sub-directories: only regular files can be opened as videos
        if not os.path.isfile(video_path):
            continue

        process_video(video_path)

        print(f'Video processed: {video}')

# If filename is a valid file in root directory, process only that file
else:
    process_video(filename)

0.33128887
