In [12]:
import glob
import json
import re
import os
from os import path


In [13]:
def clean_images(basepath):
    count = 0
    images = glob.glob(f'{basepath}/**/*.jpg')
    for image in images:
        os.remove(image)
        count += 1
    print("Cleaned ", count, " images.")


In [14]:
clean_images("train")
clean_images("test")

Cleaned  720  images.
Cleaned  158  images.


In [15]:
def process_movie(basepath, moviePath, label, split):
    if path.exists(f'{basepath}/{moviePath}'):
        print("Processing movie, ", moviePath, " with label, ", label, " and split, ", split)
        movie = re.sub(r'(\.mp4$)', '', moviePath)
        !ffmpeg -i {basepath}/{moviePath} -r 1/1 {split}/{label}/{movie}%08d.jpg
        return 1
    return 0

In [16]:

metadataFiles = glob.glob('movies/**/metadata.json')
MAX_MOVIES=100

moviesProcessed = 0
testSamplingRate = 4  # every 5th movie, put into test

def processMetadataFile(metadataFile, moviesProcessed):
    numRealMovies = 0
    numFakeMovies = 0
    originalsProcessed = 0

    for metadata in metadataFile:
        print("found metadata, ", metadata)
        basepath=re.sub(r'(\/metadata.json$)', '', metadata)

        with open(metadata) as f:        
            data = json.load(f)

            for key in data:
                if moviesProcessed > MAX_MOVIES:
                    break
                label = data[key]['label']
                if moviesProcessed % testSamplingRate == 0:
                    split = "test"
                else:
                    split = "train"
                if label == "FAKE" and 2 * numFakeMovies > numRealMovies:
                    print("Skipping because we need balanced fake and real videos")
                elif label == "FAKE":
                    original = data[key]['original']
                    processed = process_movie(basepath, key, label, split)
                    moviesProcessed += processed
                    numFakeMovies += processed

                    processed = process_movie(basepath, original, "REAL", split)
                    moviesProcessed += processed
                    numRealMovies += processed
                    originalsProcessed += processed

                else:
                    numRealMovies += 1
                    processed = process_movie(basepath, key, label, split)

                    moviesProcessed += processed
    print("Processed originals", originalsProcessed, " videos for train.")
    return moviesProcessed
      
moviesProcessed += processMetadataFile(metadataFiles, moviesProcessed)
    



found metadata,  movies/dfdc_train_part_0/metadata.json
Processing movie,  htorvhbcae.mp4  with label,  FAKE  and split,  test
ffmpeg version 4.2.1 Copyright (c) 2000-2019 the FFmpeg developers
  built with Apple clang version 11.0.0 (clang-1100.0.33.8)
  configuration: --prefix=/usr/local/Cellar/ffmpeg/4.2.1_2 --enable-shared --enable-pthreads --enable-version3 --enable-avresample --cc=clang --host-cflags='-I/Library/Java/JavaVirtualMachines/adoptopenjdk-13.jdk/Contents/Home/include -I/Library/Java/JavaVirtualMachines/adoptopenjdk-13.jdk/Contents/Home/include/darwin -fno-stack-check' --host-ldflags= --enable-ffplay --enable-gnutls --enable-gpl --enable-libaom --enable-libbluray --enable-libmp3lame --enable-libopus --enable-librubberband --enable-libsnappy --enable-libtesseract --enable-libtheora --enable-libvidstab --enable-libvorbis --enable-libvpx --enable-libx264 --enable-libx265 --enable-libxvid --enable-lzma --enable-libfontconfig --enable-libfreetype --enable-frei0r --enable-lib

In [17]:
print("Processed ", moviesProcessed, " videos for train.")

Processed  101  videos for train.


In [18]:
from detect_face import crop_face_save_jpg

In [19]:
trainImages = glob.glob('train/**/*.jpg')


In [20]:
# Find the faces in each image, and crop it out
# If the face can't be detected, then we remove the image from dataset
for image in trainImages:
    crop_face_save_jpg(image)

train/real/qyqufaskjs00000009.jpg  did not have a face
train/real/wfzjxzhdkj00000005.jpg  did not have a face
train/real/wfzjxzhdkj00000011.jpg  did not have a face
train/real/wfzjxzhdkj00000010.jpg  did not have a face
train/real/qyqufaskjs00000008.jpg  did not have a face
train/real/xjzkfqddyk00000012.jpg  did not have a face
train/real/xjzkfqddyk00000006.jpg  did not have a face
train/real/xjzkfqddyk00000010.jpg  did not have a face
train/real/prdrkaxeob00000008.jpg  did not have a face
train/real/fsaronfupy00000008.jpg  did not have a face
train/real/upmgtackuf00000010.jpg  did not have a face
train/real/wfzjxzhdkj00000012.jpg  did not have a face
train/real/wfzjxzhdkj00000006.jpg  did not have a face
train/real/upmgtackuf00000011.jpg  did not have a face
train/real/fsaronfupy00000009.jpg  did not have a face
train/real/prdrkaxeob00000009.jpg  did not have a face
train/real/xjzkfqddyk00000005.jpg  did not have a face
train/real/xjzkfqddyk00000001.jpg  did not have a face
train/real

In [21]:
testImages = glob.glob('test/**/*.jpg')
for image in testImages:
    crop_face_save_jpg(image)

test/real/vtunvalyji00000005.jpg  did not have a face
test/real/vtunvalyji00000011.jpg  did not have a face
test/real/qyqufaskjs00000009.jpg  did not have a face
test/real/qyqufaskjs00000008.jpg  did not have a face
test/real/vtunvalyji00000010.jpg  did not have a face
test/real/vtunvalyji00000004.jpg  did not have a face
test/real/prdrkaxeob00000008.jpg  did not have a face
test/real/vtunvalyji00000012.jpg  did not have a face
test/real/vtunvalyji00000006.jpg  did not have a face
test/real/vtunvalyji00000007.jpg  did not have a face
test/real/prdrkaxeob00000009.jpg  did not have a face
test/real/vtunvalyji00000003.jpg  did not have a face
test/real/vtunvalyji00000002.jpg  did not have a face
test/real/vtunvalyji00000001.jpg  did not have a face
test/real/fdpisghkmd00000009.jpg  did not have a face
test/real/hivnldfvyl00000002.jpg  did not have a face
test/real/hivnldfvyl00000003.jpg  did not have a face
test/real/fdpisghkmd00000008.jpg  did not have a face
test/real/hivnldfvyl00000007

In [22]:
print("Finished preprocessing movies")

Finished preprocessing movies


In [6]:
import numpy as np
from keras.preprocessing.image import load_img
from keras.preprocessing.image import img_to_array
from keras.applications.vgg16 import preprocess_input
from keras.applications.vgg16 import decode_predictions
from keras.applications.vgg16 import VGG16
from keras.models import Model
from pickle import dump

def extract_features(filepath):
    # load model
    model = VGG16()
    # remove the output layer
    model.layers.pop()
    model = Model(inputs=model.inputs, outputs=model.layers[-1].output)

    features = np.zeros((1, 4096))
    filepath = os.path.join(filepath,'*.jpg')
    faces = glob.glob(filepath)
    for face in faces:
        # load an image from file
        image = load_img(face, target_size=(224, 224))
        # convert the image pixels to a numpy array
        image = img_to_array(image)
        # reshape data for the model
        image = image.reshape((1, image.shape[0], image.shape[1], image.shape[2]))
        #print(image.shape)
        #prepare the image for the VGG model
        image = preprocess_input(image)
        # get extracted features
        features = np.append(features,  model.predict(image), axis = 0)
        #print(features.shape)
        # save to file
        #dump(features, f)
    #save to pd dataframe
    return pd.DataFrame(features)
train_fake_df = extract_features('train/fake/')
print(train_fake_df)

     0         1     2     3         4         5     6         7         8     \
0     0.0  0.000000   0.0   0.0  0.000000  0.000000   0.0  0.000000  0.000000   
1     0.0  0.000000   0.0   0.0  1.375318  0.000000   0.0  0.981021  1.805275   
2     0.0  0.000000   0.0   0.0  0.298203  0.000000   0.0  1.298862  1.997893   
3     0.0  0.000000   0.0   0.0  0.000000  0.791421   0.0  0.000000  1.786620   
4     0.0  0.092738   0.0   0.0  0.257447  1.187826   0.0  0.000000  1.344750   
..    ...       ...   ...   ...       ...       ...   ...       ...       ...   
296   0.0  0.000000   0.0   0.0  0.264335  0.031874   0.0  0.000000  2.370015   
297   0.0  0.000000   0.0   0.0  0.896690  0.000000   0.0  0.000000  2.777396   
298   0.0  0.000000   0.0   0.0  0.651859  0.000000   0.0  0.000000  2.335115   
299   0.0  0.000000   0.0   0.0  1.302316  0.758083   0.0  0.000000  0.803667   
300   0.0  0.000000   0.0   0.0  0.389866  0.000000   0.0  0.000000  1.168435   

         9     ...     4086