# DeepFake data

## Handling raw data from Kaggle

In [4]:
import pandas as pd
import numpy as np

In [2]:
# The metadata JSON is keyed by video file name; turn that key into a regular
# `video` column and look at the class balance.
metadata_path = "../data/dfdc_train_part_5/metadata.json"
df = (
    pd.read_json(metadata_path, orient="index")
    .reset_index()
    .rename(columns={'index': 'video'})
)
df.label.value_counts()

FAKE    2123
REAL     360
Name: label, dtype: int64

1. Creating a balanced dataset with all `REAL` videos (360) and an equal number of randomly selected `FAKE`s.
2. Splitting into training (80%) and validation (20%). 

In [5]:
# Balance the classes: keep every REAL clip and draw the same number of FAKEs.
real_df = df.loc[df.label == "REAL"]
fake_df = df.loc[df.label == "FAKE"].sample(len(real_df), random_state=0)
# `DataFrame.append` was removed in pandas 2.0; `pd.concat` is the supported equivalent.
data_df = pd.concat([real_df, fake_df])

# Shuffle the rows so that the positional split below is random.
data_df = data_df.sample(len(data_df), random_state=0)

# The first 20% of the shuffled rows go to validation, the rest to training.
# The training count is the remainder (not a second truncated product) so the
# label list always has exactly one entry per row.
n_valid = int(len(data_df) * 0.2)
n_train = len(data_df) - n_valid
data_df["split"] = n_valid * ["valid"] + n_train * ["train"]

# Binary target: 1 for FAKE, 0 for REAL.
data_df["target"] = np.where(data_df.label == "FAKE", 1, 0)
data_df["filepath"] = "/home/ubuntu/data/dfdc_train_part_5/" + data_df.video
data_df.head()

Unnamed: 0,video,label,split,original,target,filepath
1883,aonaupzilc.mp4,FAKE,valid,btsbqigrcq.mp4,1,/home/ubuntu/data/dfdc_train_part_5/aonaupzilc...
1501,wbhxgtaswg.mp4,FAKE,valid,ubarhvreoj.mp4,1,/home/ubuntu/data/dfdc_train_part_5/wbhxgtaswg...
2161,dfjueqzmfx.mp4,REAL,valid,,0,/home/ubuntu/data/dfdc_train_part_5/dfjueqzmfx...
1901,tgwvzoncvd.mp4,REAL,valid,,0,/home/ubuntu/data/dfdc_train_part_5/tgwvzoncvd...
1705,knbsiylwur.mp4,FAKE,valid,ebuebnxpdq.mp4,1,/home/ubuntu/data/dfdc_train_part_5/knbsiylwur...


In [6]:
# Number of videos per (split, label) pair.
data_df.groupby(["split", "label"])["video"].count()

split  label
train  FAKE     284
       REAL     292
valid  FAKE      76
       REAL      68
Name: video, dtype: int64

Saving filepath and binary target in TSV format as expected by PyTorchVideo (this is just one of the many ways to feed data into a PyTorchVideo model)

In [None]:
# One headerless, tab-separated file per split, each row being `filepath<TAB>target`.
split_files = {"valid": "valid.csv", "train": "train.csv"}
for split_name, out_file in split_files.items():
    split_rows = data_df.loc[data_df.split == split_name, ["filepath", "target"]]
    split_rows.to_csv(out_file, sep="\t", index=False, header=None)

In [10]:
# Sanity check of the written file: each line should be `filepath<TAB>target`.
! tail train.csv

/home/ubuntu/data/dfdc_train_part_5/hmnspetjoc.mp4	0
/home/ubuntu/data/dfdc_train_part_5/yynszivwba.mp4	1
/home/ubuntu/data/dfdc_train_part_5/csomiogcwu.mp4	0
/home/ubuntu/data/dfdc_train_part_5/lfiyoyymnn.mp4	0
/home/ubuntu/data/dfdc_train_part_5/tohrqjyter.mp4	0
/home/ubuntu/data/dfdc_train_part_5/hblkonvlyq.mp4	1
/home/ubuntu/data/dfdc_train_part_5/xsgmwwbeib.mp4	0
/home/ubuntu/data/dfdc_train_part_5/emsnbjdsve.mp4	1
/home/ubuntu/data/dfdc_train_part_5/bxtaeanoyr.mp4	1
/home/ubuntu/data/dfdc_train_part_5/bpzmcgorle.mp4	1


## Face cropping

Taking a look at the data it seemed apparent that, for deepfake videos, the manipulation had occurred in the facial region of the displayed individual.

Given the original MP4s are `1928 x 1080` pixels, and the face occupies only a small region of the frame, we are feeding the model with a lot of useless information. 

On top of that, the original frames are too big to fit on the GPU and have to be resized anyway, so we also lose useful signal.

So, I shamelessly copied the approach showcased in [this](https://www.kaggle.com/timesler/fast-mtcnn-detector-55-fps-at-full-resolution) Kaggle kernel.

The idea is to use MTCNN (shipped with the [`facenet_pytorch`](https://github.com/timesler/facenet-pytorch) package) to detect faces every N frames of each video and stack them together into a new MP4 file.

We end up with a much lighter result (10 FPS instead of the original 30, and `256 x 256` instead of `1928 x 1080` frames) focused on our region of interest.

In [11]:
from facenet_pytorch import MTCNN
from PIL import Image
import torch
from imutils.video import FileVideoStream
import cv2
from pathlib import Path
from tqdm.notebook import tqdm
from PIL import Image
import numpy as np
import ffmpeg
import pandas as pd

# Run the detector on the GPU when one is available.
device = 'cuda' if torch.cuda.is_available() else 'cpu'

# Reload the headerless TSVs written earlier (columns: file path, binary label).
tr = pd.read_csv("train.csv", sep="\t", names=["fname", "label"])
va = pd.read_csv("valid.csv", sep="\t", names=["fname", "label"])
# `DataFrame.append` was removed in pandas 2.0; `pd.concat` is the supported equivalent.
df = pd.concat([tr, va])
filenames = df.fname.tolist()
len(filenames)

720

In [12]:
def to_image(array):
    """Convert `array` into a PIL image.

    `array` must expose `.squeeze().numpy()` (e.g. a torch tensor). Singleton
    dimensions are squeezed out, values are cast to `uint8`, and the first
    axis is moved to the last position before the data is handed to PIL.
    """
    pixels = array.squeeze().numpy().astype(np.uint8)
    first_axis_last = np.moveaxis(pixels, 0, -1)
    return Image.fromarray(first_axis_last)

class FastMTCNN:
    """MTCNN wrapper that only runs detection on every `stride`-th frame."""

    def __init__(self, stride, resize=1, *args, **kwargs):
        """Store the sampling settings and build the underlying detector.

        Arguments:
            stride (int): Step between the frames handed to the detector; only
                every `stride`-th frame of a batch is processed.

        Keyword arguments:
            resize (float): Factor applied to each frame's width and height
                before detection; `1` leaves the frames untouched. [default: {1}]
            *args: Positional arguments forwarded to the MTCNN constructor.
            **kwargs: Keyword arguments forwarded to the MTCNN constructor.
        """
        self.stride = stride
        self.resize = resize
        self.mtcnn = MTCNN(*args, **kwargs)

    def __call__(self, frames):
        """Return the detector output for every `stride`-th frame of `frames`.

        When `resize` is not 1, each frame is rescaled with `cv2.resize` first.
        The result therefore has one entry per sampled frame, not per input frame.
        """
        scale = self.resize
        if scale != 1:
            rescaled = []
            for frame in frames:
                height, width = frame.shape[0], frame.shape[1]
                rescaled.append(cv2.resize(frame, (int(width * scale), int(height * scale))))
            frames = rescaled

        sampled = frames[::self.stride]
        return self.mtcnn.forward(sampled)

class FacesVideo:
    """Turn full-frame clips into short face-only videos.

    For each input file the pipeline is: detect faces on sampled frames
    (`extract_faces`), write them as numbered JPEGs (`save_frames`) and stitch
    the JPEGs into an MP4 with ffmpeg (`collate_video`).
    """

    def __init__(self, filenames, fast_mtcnn, batch_size=60, framerate=10):
        """
        Arguments:
            filenames (list of str): Paths of the videos to process.
            fast_mtcnn (callable): Called with a list of RGB frames; must return
                a list holding a face (or `None` for a miss) per sampled frame.
            batch_size (int): Number of frames buffered before each detector call.
            framerate (int): Frame rate of the generated face video.
        """
        self.filenames = filenames
        self.fast_mtcnn = fast_mtcnn
        self.batch_size = batch_size
        self.framerate = framerate

    def extract_faces(self, filename):
        """Read `filename` and return its detected faces with misses filled in.

        Frames are converted from BGR to RGB and sent to the detector in batches
        of `batch_size`. Returns an empty list when no face was found at all.
        """
        frames = []
        faces = []

        v_cap = FileVideoStream(filename).start()
        try:
            v_len = int(v_cap.stream.get(cv2.CAP_PROP_FRAME_COUNT))

            for _ in range(v_len):
                frame = v_cap.read()
                if frame is None:
                    # Defensive: stop if the stream hands back no frame (e.g. fewer
                    # decodable frames than the reported count) instead of letting
                    # `cvtColor` fail on `None`.
                    break
                frames.append(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))

                if len(frames) >= self.batch_size:
                    faces += self.fast_mtcnn(frames)
                    frames = []

            # Flush the final, possibly shorter, batch.
            if frames:
                faces += self.fast_mtcnn(frames)
        finally:
            # Stop the stream even when reading or detection raised.
            v_cap.stop()

        return self.check_faces(faces)

    def get_first_non_none(self, faces):
        """Return the first entry of `faces` that is not `None`, or `None` if there is none."""
        return next((face for face in faces if face is not None), None)

    def check_faces(self, faces, window=10):
        """Replace every missed detection (`None`) in `faces` with a nearby face.

        A miss is filled with the first face found from its own position onwards;
        within the last `window` entries the search starts `window` entries
        earlier. If that search finds nothing, the most recent earlier face is
        reused, so a long run of misses at the end of a video is still filled.

        `faces` is modified in place and returned. An empty list is returned when
        `faces` contains no face at all.
        """
        last_seen = None
        for i, face in enumerate(faces):
            if face is None:
                if i > (len(faces) - window):
                    # Clamp at 0: a negative start would wrap around and make the
                    # slice cover only the tail of a short list.
                    start = max(i - window, 0)
                else:
                    start = i
                replacement = self.get_first_non_none(faces[start:])
                if replacement is None:
                    replacement = last_seen
                faces[i] = replacement
            if faces[i] is not None:
                last_seen = faces[i]

        # The search window always includes everything from `i` onwards, so an entry
        # can only remain unfilled when the whole list holds no face.
        if last_seen is None:
            return []
        return faces

    def save_frames(self, faces, filename):
        """Write `faces` as zero-padded, numbered JPEGs and return their directory.

        The directory is derived from `filename` by dropping ".mp4" and replacing
        "dfdc_train_part_5" with "faces/frames".
        """
        new_path = Path(filename.replace(".mp4", "").replace("dfdc_train_part_5", "faces/frames"))
        new_path.mkdir(parents=True, exist_ok=True)

        for i, face in enumerate(faces):
            to_image(face).save(new_path / f"{str(i).zfill(3)}.jpg")
        return new_path

    def collate_video(self, path):
        """Stitch the JPEGs found in `path` into `<path with frames->videos>_face.mp4`."""
        movie_path = str(path).replace("frames", "videos") + "_face.mp4"
        # The output directory is not created anywhere else, so make sure it
        # exists before ffmpeg tries to write into it.
        Path(movie_path).parent.mkdir(parents=True, exist_ok=True)
        ffmpeg.input(str(path) + '/*.jpg', pattern_type='glob', framerate=self.framerate).output(movie_path).overwrite_output().run()

    def get_face_videos(self, filename):
        """Run the full pipeline for one video; videos without any face are skipped."""
        import warnings

        faces = self.extract_faces(filename)
        if not faces:
            # Nothing to write: skip this file rather than abort the whole run.
            warnings.warn(f"No face detected in {filename}; skipping it.")
            return
        faces_path = self.save_frames(faces, filename)
        self.collate_video(faces_path)

    def run(self):
        """Process every file in `self.filenames`, showing a progress bar."""
        for f in tqdm(self.filenames):
            self.get_face_videos(f)

In [None]:
# Sample every 3rd frame (`stride=3`) at full resolution (`resize=1`); the
# remaining keywords are forwarded to the MTCNN constructor.
fast_mtcnn = FastMTCNN(
    stride=3,
    resize=1,
    device=device,
    margin=30,
    factor=0.6,
    keep_all=False,
    post_process=False,
)

# `FacesVideo` writes the crops at its default `framerate` of 10 FPS,
# i.e. 3 times lower than the 30 FPS of the original clips.
fv = FacesVideo(filenames=filenames, fast_mtcnn=fast_mtcnn)
fv.run()

Let's take a sneak peek at the resulting videos.

### Face cropped video...

In [18]:
from IPython.display import Video

# Looking into FAKE (resampled) videos (`label==1`). The clip below is fixed by
# hand; to draw a random one instead use:
#   df.loc[df.label==1].fname.sample().values[0]
path = "/home/ubuntu/data/dfdc_train_part_5/iwladlmomt.mp4"
# Map the original clip to its face-cropped counterpart.
fn = (
    path
    .replace(".mp4", "_face.mp4")
    .replace("dfdc_train_part_5", "faces/videos")
)
Video(fn)

### ... and the original MP4 file

In [19]:
# The same clip at full frame, for comparison with its face-cropped version.
Video(path)

Saving new TSV files for PyTorchVideo

In [None]:
def _face_video_path(fname):
    """Map an original clip path to the path of its face-cropped video."""
    return fname.replace(".mp4", "_face.mp4").replace("dfdc_train_part_5", "faces/videos")


# Add the face-video path to both splits and write headerless, tab-separated
# `face_fname<TAB>label` files.
for split_df, out_file in ((va, "valid_faces.csv"), (tr, "train_faces.csv")):
    split_df["face_fname"] = split_df.fname.apply(_face_video_path)
    split_df[["face_fname", "label"]].to_csv(out_file, sep="\t", index=False, header=None)