# Libraries Used

* ffmpeg-python
* av
* cmake
* dlib  (based on the python version)
* face-recognition

# Imports & Globals

In [2]:
import av
import face_recognition

import numpy as np
import os
import random

In [10]:
# Dataset root directories used by the preprocessing steps
DS_ORG, DS_IFRAME = './dataset_original/', './dataset_IFrames/'
DS_FACE, DS_FINAL = './dataset_face/', './dataset_final/'
DS_SEG = './dataset_segments/'

# Class sub-directory names that get appended to a dataset root
CELEB_REAL, CELEB_FAKE = 'Celeb-real/', 'Celeb-synthesis/'
YT_REAL = 'YouTube-real/'

# Filename prefixes used for the three segments of a video
SEG = [f'seg_{index}_' for index in range(1, 4)]

# I-Frame Extraction

## Testing Logic

In [None]:
# Print the picture type and key-frame flag of every decoded frame.
# The container is opened in a `with` block so it is closed even if
# demuxing or decoding raises.
with av.open('dataset_original/Celeb-real/id0_0000.mp4') as test_vid:
    for packet in test_vid.demux():
        for frame in packet.decode():
            print(f'{frame.pict_type} - {frame.key_frame}')

In [77]:
# Copy only the key-frame packets of one video into a new file (no re-encode).
# Both containers are managed by `with`, so they are closed even when
# demuxing or muxing raises.
with av.open('dataset_original/Celeb-real/id0_0000.mp4') as test_input, \
        av.open('dataset_IFrames/id0_0000.mp4', 'w') as test_output:
    in_stream = test_input.streams.video[0]
    in_stream.codec_context.skip_frame = "NONKEY"

    # The output stream reuses the input stream's codec parameters
    out_stream = test_output.add_stream(template=in_stream)

    for packet in test_input.demux(in_stream):
        # Packets without a dts cannot be muxed (presumably the demuxer's
        # trailing flush packets), so skip them
        if packet.dts is None:
            continue

        if packet.is_keyframe:
            packet.stream = out_stream
            test_output.mux(packet)

In [9]:
# Dry run of the key-frame copy on the first 9 listed real celebrity videos.
for video in os.listdir(DS_ORG + CELEB_REAL)[:9]:
    # `with` closes both containers even if one video fails part-way through
    with av.open(DS_ORG + CELEB_REAL + video) as input_vid, \
            av.open(DS_IFRAME + CELEB_REAL + video, 'w') as output_vid:
        in_stream = input_vid.streams.video[0]
        in_stream.codec_context.skip_frame = "NONKEY"

        out_stream = output_vid.add_stream(template=in_stream)

        for packet in input_vid.demux(in_stream):
            # Packets without a dts cannot be muxed; skip them
            if packet.dts is None:
                continue

            if packet.is_keyframe:
                packet.stream = out_stream
                output_vid.mux(packet)


## Implementation

In [78]:
def extract_frames(src_dir, dest_dir, vid_class, filename):
    """Copy only the key-frame packets of a video into a new file.

    The video at ``src_dir + vid_class + filename`` is demuxed and each
    key-frame packet is muxed, without re-encoding, into
    ``dest_dir + vid_class + filename``.

    Args:
        src_dir: Root directory of the source dataset (with trailing slash).
        dest_dir: Root directory for the output (with trailing slash).
        vid_class: Class sub-directory name (with trailing slash).
        filename: Name of the video file inside the class directory.
    """
    # `with` guarantees both containers are closed even when demuxing or
    # muxing raises, instead of relying on trailing close() calls.
    with av.open(src_dir + vid_class + filename) as input_vid, \
            av.open(dest_dir + vid_class + filename, 'w') as output_vid:
        in_stream = input_vid.streams.video[0]
        in_stream.codec_context.skip_frame = "NONKEY"

        # The output stream reuses the input stream's codec parameters
        out_stream = output_vid.add_stream(template=in_stream)

        for packet in input_vid.demux(in_stream):
            # Packets without a dts cannot be muxed; skip them
            if packet.dts is None:
                continue

            if packet.is_keyframe:
                packet.stream = out_stream
                output_vid.mux(packet)

In [79]:
# Run the I-frame extraction over every real celebrity video

celeb_real_videos = os.listdir(DS_ORG + CELEB_REAL)
for video in celeb_real_videos:
    extract_frames(DS_ORG, DS_IFRAME, CELEB_REAL, video)

In [12]:
# Run the I-frame extraction over every real YouTube video

yt_real_videos = os.listdir(DS_ORG + YT_REAL)
for video in yt_real_videos:
    extract_frames(DS_ORG, DS_IFRAME, YT_REAL, video)

In [13]:
# Extracting I-Frames from deepfake celebrity videos
# 408 videos are chosen at random so that equal numbers of real and fake
# videos are used for training (158 real celeb videos + 250 real youtube videos).

fake_videos = os.listdir(DS_ORG + CELEB_FAKE)
# random.sample raises ValueError when asked for more items than exist,
# so never request more videos than the directory actually holds.
video_list = random.sample(fake_videos, min(408, len(fake_videos)))
for video in video_list:
    extract_frames(DS_ORG, DS_IFRAME, CELEB_FAKE, video)

# Face Extraction

## Functions

In [3]:
# MesoNet works best with images having 256x256 dimension
# If face location borders span a smaller distance, extend the borders
# on either side to widen the crop

def normalize_face_borders(low, high):
    """Widen a face span that is narrower than 256 pixels.

    Args:
        low: Lower coordinate of the span.
        high: Upper coordinate of the span.

    Returns:
        A ``(low, high)`` tuple. Spans of 256 or more are returned unchanged.
        Narrower spans have each border moved outward by half the span; when
        that would push ``low`` below 0, ``low`` is clamped to 0 and the
        shortfall (plus 1 for odd spans) is added to the ``high`` side.

    NOTE(review): the padding is half the span per side, so the result is
    about twice the original span rather than exactly 256;
    modified_normalize_face_borders pads towards 256 instead.
    """
    diff = high - low
    if diff >= 256:
        # Always return a pair: callers unpack two values, so a bare
        # `return` (None) would raise TypeError for large faces.
        return low, high

    x = diff / 2
    if low >= x:
        low -= x
    else:
        # Not enough room below: clamp at 0 and move the remainder upward
        x = x + (x - low) + (1 if diff % 2 == 1 else 0)
        low = 0

    high += x

    return low, high

In [6]:
# New normalize function that pads the face span to 256 whenever the frame
# allows it; the cropped face is fed as input to the MesoNet

def modified_normalize_face_borders(low, high, boundary):
    """Pad a span narrower than 256 so it covers 256 units inside the frame.

    Args:
        low: Lower coordinate of the span.
        high: Upper coordinate of the span.
        boundary: Largest valid coordinate (frame width or height).

    Returns:
        A ``(low, high)`` tuple. Spans wider than 256 are returned unchanged.
        Otherwise the missing amount is split between both sides using
        integer division (the extra unit of an odd amount goes to ``high``).
        If one side hits 0 or ``boundary``, the window is shifted towards the
        other side so it stays 256 wide; it is only narrower than 256 when
        ``boundary`` itself is smaller than 256.
    """
    diff = high - low
    if diff > 256:
        return low, high

    offset = 256 - diff
    pad_low = offset // 2
    pad_high = offset - pad_low

    low -= pad_low
    high += pad_high

    # Ran past the lower edge: shift the whole window up instead of
    # truncating it, which would shrink the crop below 256.
    if low < 0:
        high -= low
        low = 0

    # Ran past the upper edge: shift the window down, never below 0
    if high > boundary:
        low = max(0, low - (high - boundary))
        high = boundary

    return low, high

In [5]:
def get_crop_window(face_location, height, width):
    """Build the crop box for the first entry of ``face_location``.

    Args:
        face_location: Sequence of ``(top, right, bottom, left)`` tuples;
            only the first entry is used.
        height: Frame height, used as the vertical boundary.
        width: Frame width, used as the horizontal boundary.

    Returns:
        A ``(left, top, right, bottom)`` tuple whose horizontal and vertical
        spans were passed through modified_normalize_face_borders.
    """
    first_face = face_location[0]

    # Horizontal span is used as-is; vertical coordinates are mirrored
    # (y -> height - y) before being normalized.
    x_low = first_face[3]
    y_high = height - first_face[0]
    x_high = first_face[1]
    y_low = height - first_face[2]

    left, right = modified_normalize_face_borders(x_low, x_high, width)
    bot, top = modified_normalize_face_borders(y_low, y_high, height)

    # Mirror the vertical coordinates back for the returned box
    return (left, height - top, right, height - bot)

## Testing Logic

In [67]:
# Crop the first detected face of every frame and save it as a JPEG.
# `with` closes the container even if detection or saving raises.
with av.open('dataset_IFrames/Celeb-real/id0_0000.mp4') as test_input:
    # `count` is the index of the decoded frame, so frames that are skipped
    # leave gaps in the saved file numbering.
    for count, frame in enumerate(test_input.decode(video=0)):
        # Detect on an RGB array: without an explicit format, to_ndarray()
        # returns the frame's native pixel layout instead.
        nd_frame = frame.to_ndarray(format='rgb24')
        img_frame = frame.to_image()

        height, width = img_frame.height, img_frame.width

        # Face location returned by face_recognition api: [(top, right, bottom, left)] in css terms
        # Face location required by PIL.Image: (left, top, right, bottom)
        face_location = face_recognition.api.face_locations(nd_frame)

        # get_crop_window indexes the first face, so skip frames without one
        if not face_location:
            continue

        face_location = get_crop_window(face_location, height, width)

        img_frame = img_frame.crop(face_location)
        img_frame.save(f'dataset_face/Celeb-real/id0_0000_{count}.jpg')

### Turning Cropped faces to a Video

In [13]:
# Re-encode the cropped face of every frame into a new video.
# The output goes under 'dataset_face/' (lower case), matching DS_FACE and
# the JPEG test above; 'dataset_Face/' is a different directory on
# case-sensitive file systems.
with av.open('dataset_IFrames/Celeb-real/id10_0001.mp4') as test_input, \
        av.open('dataset_face/Celeb-real/id10_0001.mp4', 'w') as test_output:
    in_stream = test_input.streams.video[0]
    codec_name = in_stream.codec_context.name

    # Same codec, size and pixel format as the input, at a rate of 2
    out_stream = test_output.add_stream(codec_name, 2)
    out_stream.width = in_stream.codec_context.width
    out_stream.height = in_stream.codec_context.height
    out_stream.pix_fmt = in_stream.codec_context.pix_fmt

    for frame in test_input.decode(in_stream):
        img_frame = frame.to_image()
        # Detect on an RGB array: without an explicit format, to_ndarray()
        # returns the frame's native pixel layout instead.
        nd_frame = frame.to_ndarray(format='rgb24')

        height, width = img_frame.height, img_frame.width

        # Face location returned by face_recognition api: [(top, right, bottom, left)]
        # Face location required by PIL.Image: (left, top, right, bottom)
        face_location = face_recognition.api.face_locations(nd_frame)

        # No face found: skip the frame
        if not face_location:
            continue

        face_location = get_crop_window(face_location, height, width)
        img_frame = img_frame.crop(face_location)

        out_frame = av.VideoFrame.from_image(img_frame)
        test_output.mux(out_stream.encode(out_frame))

    # Flush the frames still buffered inside the encoder
    test_output.mux(out_stream.encode(None))

bitrate tolerance 128000 too small for bitrate 1024000, overriding


## Implementation

In [8]:
def save_cropped_faces_to_video(src_dir, dest_dir, vid_class, filename):
    """Write a 256x256 video containing the cropped face of each frame.

    Reads ``src_dir + vid_class + filename``, detects faces in every frame,
    crops the window returned by get_crop_window for the first face and
    encodes the crops into ``dest_dir + vid_class + filename``. Frames in
    which no face is found are skipped.

    Args:
        src_dir: Root directory of the source videos (with trailing slash).
        dest_dir: Root directory for the face videos (with trailing slash).
        vid_class: Class sub-directory name (with trailing slash).
        filename: Name of the video file inside the class directory.
    """
    # `with` closes both containers even if decoding, detection or encoding
    # raises; local names avoid shadowing the builtin `input`.
    with av.open(src_dir + vid_class + filename) as in_container, \
            av.open(dest_dir + vid_class + filename, 'w') as out_container:
        in_stream = in_container.streams.video[0]
        codec_name = in_stream.codec_context.name

        # output video dimension should be 256x256
        out_stream = out_container.add_stream(codec_name, 2)
        out_stream.width = 256
        out_stream.height = 256
        out_stream.pix_fmt = in_stream.codec_context.pix_fmt

        for frame in in_container.decode(in_stream):
            img_frame = frame.to_image()
            # Detect on an RGB array: without an explicit format,
            # to_ndarray() returns the frame's native pixel layout instead.
            nd_frame = frame.to_ndarray(format='rgb24')

            height, width = img_frame.height, img_frame.width

            # Face location returned by face_recognition api: [(top, right, bottom, left)]
            # Face location required by PIL.Image: (left, top, right, bottom)
            face_location = face_recognition.api.face_locations(nd_frame)

            # can't find a face, then skip that frame
            # TODO : sync frame skipping with temporality stream
            if not face_location:
                continue

            face_location = get_crop_window(face_location, height, width)
            img_frame = img_frame.crop(face_location)

            out_frame = av.VideoFrame.from_image(img_frame)
            out_container.mux(out_stream.encode(out_frame))

        # Flush the frames still buffered inside the encoder
        out_container.mux(out_stream.encode(None))

## Simple Method to save cropped faces to video

The codec resizes every frame to the dimensions specified on the output stream.
The face_location returned by the face_recognition API can therefore be used directly, without normalizing the borders.

In [11]:
def simple_save_cropped_faces_to_video(src_dir, dest_dir, vid_class, filename):
    """Write a 256x256 video of the raw face box cropped from each frame.

    Reads ``src_dir + vid_class + filename``, detects faces in every frame,
    crops the first detected face box as returned by face_recognition and
    encodes the crops into ``dest_dir + vid_class + filename``. Frames in
    which no face is found are skipped.

    Args:
        src_dir: Root directory of the source videos (with trailing slash).
        dest_dir: Root directory for the face videos (with trailing slash).
        vid_class: Class sub-directory name (with trailing slash).
        filename: Name of the video file inside the class directory.
    """
    # `with` closes both containers even if decoding, detection or encoding
    # raises; local names avoid shadowing the builtin `input`.
    with av.open(src_dir + vid_class + filename) as in_container, \
            av.open(dest_dir + vid_class + filename, 'w') as out_container:
        in_stream = in_container.streams.video[0]
        codec_name = in_stream.codec_context.name

        # output video dimension should be 256x256
        out_stream = out_container.add_stream(codec_name, 2)
        out_stream.width = 256
        out_stream.height = 256
        out_stream.pix_fmt = in_stream.codec_context.pix_fmt

        for frame in in_container.decode(in_stream):
            img_frame = frame.to_image()
            # Detect on an RGB array: without an explicit format,
            # to_ndarray() returns the frame's native pixel layout instead.
            nd_frame = frame.to_ndarray(format='rgb24')

            # Face location returned by face_recognition api: [(top, right, bottom, left)]
            # Face location required by PIL.Image: (left, top, right, bottom)
            face_location = face_recognition.api.face_locations(nd_frame)

            # can't find a face, then skip that frame
            # TODO : sync frame skipping with temporality stream
            if not face_location:
                continue

            # since the codec resizes the video depending on specified dimension
            # no need to normalize borders
            top, right, bottom, left = face_location[0][:4]
            img_frame = img_frame.crop((left, top, right, bottom))

            out_frame = av.VideoFrame.from_image(img_frame)
            out_container.mux(out_stream.encode(out_frame))

        # Flush the frames still buffered inside the encoder
        out_container.mux(out_stream.encode(None))

In [None]:
# Face videos for the real celebrity I-frame videos (raw face box variant)
celeb_real_iframe_videos = os.listdir(DS_IFRAME + CELEB_REAL)
for video in celeb_real_iframe_videos:
    simple_save_cropped_faces_to_video(DS_IFRAME, DS_FACE, CELEB_REAL, video)

In [None]:
# Face videos for the deepfake celebrity I-frame videos
celeb_fake_iframe_videos = os.listdir(DS_IFRAME + CELEB_FAKE)
for video in celeb_fake_iframe_videos:
    save_cropped_faces_to_video(DS_IFRAME, DS_FACE, CELEB_FAKE, video)

In [None]:
# Face videos for the real YouTube I-frame videos
yt_real_iframe_videos = os.listdir(DS_IFRAME + YT_REAL)
for video in yt_real_iframe_videos:
    save_cropped_faces_to_video(DS_IFRAME, DS_FACE, YT_REAL, video)

## Dividing Videos into 3 Segments


#### Implementation

In [22]:
# Segment each video into 3 segments
def _count_video_frames(path):
    """Count the frames of the first video stream of *path* by decoding it."""
    with av.open(path) as container:
        return sum(1 for _ in container.decode(video=0))


def _open_segment_output(path, in_stream):
    """Open *path* for writing and add a 224x224 stream modelled on *in_stream*."""
    container = av.open(path, 'w')
    try:
        # output video dimension should be 224x224
        stream = container.add_stream(in_stream.codec_context.name, 2)
        stream.width = 224
        stream.height = 224
        stream.pix_fmt = in_stream.codec_context.pix_fmt
    except Exception:
        container.close()
        raise
    return container, stream


def segment_video(src_dir, dest_dir, vid_class, filename):
    """Split a video into three consecutive 224x224 segment files.

    Reads ``src_dir + vid_class + filename`` and re-encodes its frames into
    ``dest_dir + vid_class + SEG[i] + filename`` for ``i`` in 0..2. The first
    two segments receive frames until their count exceeds a third of the
    total; the last segment receives all remaining frames.

    Args:
        src_dir: Root directory of the source videos (with trailing slash).
        dest_dir: Root directory for the segments (with trailing slash).
        vid_class: Class sub-directory name (with trailing slash).
        filename: Name of the video file inside the class directory.
    """
    src_path = src_dir + vid_class + filename
    in_container = av.open(src_path)
    out_container = None
    try:
        in_stream = in_container.streams.video[0]

        # Stream.frames is 0 when the container does not record a frame
        # count; a threshold of 0 would start a new segment on the first two
        # frames, so fall back to counting by decoding.
        total_frames = in_stream.frames or _count_video_frames(src_path)
        frames_per_segment = total_frames / 3

        count = 1
        seg_no = 0
        out_container, out_stream = _open_segment_output(
            dest_dir + vid_class + SEG[seg_no] + filename, in_stream)

        for frame in in_container.decode(in_stream):
            if seg_no < 2 and count > frames_per_segment:
                count = 1
                seg_no += 1

                # Flush and close the finished segment before opening the next
                out_container.mux(out_stream.encode(None))
                out_container.close()
                out_container = None
                out_container, out_stream = _open_segment_output(
                    dest_dir + vid_class + SEG[seg_no] + filename, in_stream)

            # Round-trip through a PIL image, as before, so the encoder
            # receives a fresh frame rather than the decoded one
            out_frame = av.VideoFrame.from_image(frame.to_image())
            out_container.mux(out_stream.encode(out_frame))

            count += 1

        # Flush the frames still buffered inside the last segment's encoder
        out_container.mux(out_stream.encode(None))
    finally:
        # Close whatever is still open, even when an error interrupted the run
        if out_container is not None:
            out_container.close()
        in_container.close()

In [None]:
# Extracting 3-Segments from real celebrity videos
# Only the first listed video is processed.

celeb_real_sources = os.listdir(DS_ORG + CELEB_REAL)
if celeb_real_sources:
    video = celeb_real_sources[0]
    segment_video(DS_ORG, DS_SEG, CELEB_REAL, video)

In [None]:
# Extracting 3-Segments from fake celebrity videos
# The filenames must come from the fake-celebrity directory itself: names
# listed from Celeb-real do not identify files under Celeb-synthesis.

for video in os.listdir(DS_ORG + CELEB_FAKE):
    segment_video(DS_ORG, DS_SEG, CELEB_FAKE, video)
    # Only the first listed video is processed
    break

In [None]:
# Extracting 3-Segments from real youtube videos
# The filenames must come from the YouTube directory itself: names listed
# from Celeb-real do not identify files under YouTube-real.

for video in os.listdir(DS_ORG + YT_REAL):
    segment_video(DS_ORG, DS_SEG, YT_REAL, video)
    # Only the first listed video is processed
    break

## CoViAR (Compressed Video Action Recognition)