In [None]:
import numpy as np
import pandas as pd
import os
import matplotlib
import seaborn as sns
import matplotlib.pyplot as plt
from tqdm import tqdm_notebook
%matplotlib inline 
import cv2

## Data Exploration

In [None]:
# Root folder of the competition data and the training shard explored below.
DATA_FOLDER = '../input/deepfake-detection-challenge'
TRAIN_SAMPLE_FOLDER = 'train_part_48'

# Every entry of the shard folder is counted, whatever its type.
train_sample_count = len(os.listdir(os.path.join(DATA_FOLDER, TRAIN_SAMPLE_FOLDER)))
print(f"Train samples: {train_sample_count}")

In [None]:
# Folder holding the face-detection resources (the cascade XML loaded in the ROI section).
FACE_DETECTION_FOLDER = '../input/haar-cascades-for-face-detection'
# List the folder content so the available resource file names are visible.
print(f"Face detection resources: {os.listdir(FACE_DETECTION_FOLDER)}")

In [None]:
# Collect the distinct file extensions present in the training shard.
train_list = list(os.listdir(os.path.join(DATA_FOLDER, TRAIN_SAMPLE_FOLDER)))
ext_dict = []
for file in train_list:
    # os.path.splitext handles names without a dot (split('.')[1] raised
    # IndexError) and names with several dots (split('.')[1] returned the
    # middle part rather than the real extension).
    file_ext = os.path.splitext(file)[1].lstrip('.')
    # Entries without any extension are skipped.
    if file_ext and file_ext not in ext_dict:
        ext_dict.append(file_ext)
print(f"Extensions: {ext_dict}")

In [None]:
# The metadata file is the first entry of the shard whose name ends with 'json'.
json_candidates = [file for file in train_list if file.endswith('json')]
if not json_candidates:
    # Fail with an explicit message instead of a bare IndexError from [0].
    raise FileNotFoundError(
        f"No JSON metadata file found in {os.path.join(DATA_FOLDER, TRAIN_SAMPLE_FOLDER)}")
json_file = json_candidates[0]
print(f"JSON file: {json_file}")

In [None]:
def get_meta_from_json(path):
    '''
    Load the metadata JSON of a data subset as a dataframe
    param: path - sub-folder of DATA_FOLDER that contains the file named by json_file
    return: the parsed dataframe, transposed (the columns of the parsed frame become its rows)
    '''
    metadata_path = os.path.join(DATA_FOLDER, path, json_file)
    return pd.read_json(metadata_path).T

# Load the metadata of the training shard and preview its first rows.
meta_train_df = get_meta_from_json(TRAIN_SAMPLE_FOLDER)
meta_train_df.head()

In [None]:
def plot_count(feature, title, df, size=1):
    '''
    Plot count of classes / feature, annotating each bar with its percentage
    param: feature - the feature (column name of df) to analyze
    param: title - title to add to the graph
    param: df - dataframe from which we plot feature's classes distribution
    param: size - default 1; the figure width is 4*size inches
    Only the 20 most frequent classes are drawn.
    '''
    f, ax = plt.subplots(1, 1, figsize=(4*size, 4))
    total = float(len(df))
    # Pass the column by keyword: a positional Series is deprecated in
    # seaborn 0.11 and is bound to `data` instead of `x` from 0.12 on.
    # `ax=ax` ties the plot to the axes created above rather than to whatever
    # axes happens to be current.
    g = sns.countplot(x=feature, data=df,
                      order=df[feature].value_counts().index[:20],
                      palette='Set3', ax=ax)
    g.set_title("Number and percentage of {}".format(title))
    if(size > 2):
        plt.xticks(rotation=90, size=8)
    for p in ax.patches:
        height = p.get_height()
        # Some seaborn versions report NaN for empty bars; nothing to label then.
        if np.isnan(height):
            continue
        ax.text(p.get_x()+p.get_width()/2.,
                height + 3,
                '{:1.2f}%'.format(100*height/total),
                ha="center")
    plt.show()

In [None]:
# Distribution of the 'label' column in the training metadata.
plot_count('label', 'label (train)', meta_train_df)

## Samples

In [None]:
# Pick up to three FAKE videos to look at.
fake_rows = meta_train_df.loc[meta_train_df.label=='FAKE']
# A fixed random_state makes the selection reproducible across kernel restarts;
# min(...) avoids the ValueError sample() raises when fewer than 3 rows exist.
fake_train_sample_video = list(fake_rows.sample(min(3, len(fake_rows)), random_state=42).index)
fake_train_sample_video

In [None]:
def display_image_from_video(video_path):
    '''
    Show the first frame of a video
    input: video_path - path for video
    process:
    1. perform a video capture from the video
    2. read the first image, then release the capture
    3. convert it from BGR to RGB and display it
    raises: IOError when no frame can be read from video_path
    '''
    capture_image = cv2.VideoCapture(video_path)
    try:
        ret, frame = capture_image.read()
    finally:
        # Release the decoder handle even if the read fails.
        capture_image.release()
    if not ret:
        # Without this check cvtColor would fail on a None frame with an
        # unrelated-looking OpenCV error.
        raise IOError(f"Could not read a frame from {video_path}")
    fig = plt.figure(figsize=(10,10))
    ax = fig.add_subplot(111)
    frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
    ax.imshow(frame)

In [None]:
# Show the first frame of each sampled FAKE video.
for video_file in fake_train_sample_video:
    display_image_from_video(os.path.join(DATA_FOLDER, TRAIN_SAMPLE_FOLDER, video_file))

In [None]:
# Index entries of all metadata rows labelled FAKE.
fake_videos = list(meta_train_df.loc[meta_train_df.label=='FAKE'].index)

In [None]:
from IPython.display import HTML
from base64 import b64encode

def play_video(video_file, subset=TRAIN_SAMPLE_FOLDER):
    '''
    Display video
    param: video_file - the name of the video file to display
    param: subset - the folder (under DATA_FOLDER) where the video file is located; defaults to TRAIN_SAMPLE_FOLDER
    return: an IPython HTML object embedding the whole file as a base64 data URL
    '''
    # Read through a context manager so the file handle is closed immediately
    # instead of being left to the garbage collector.
    with open(os.path.join(DATA_FOLDER, subset, video_file), 'rb') as video_handle:
        video_bytes = video_handle.read()
    data_url = "data:video/mp4;base64," + b64encode(video_bytes).decode()
    return HTML("""<video width=500 controls><source src="%s" type="video/mp4"></video>""" % data_url)

In [None]:
# Embed the first FAKE video in the notebook output.
play_video(fake_videos[0])

## CNN

### ROI

In [None]:
# Load the frontal-face Haar cascade (the only cascade used in this notebook).
frontal_cascade_path= os.path.join(FACE_DETECTION_FOLDER,'haarcascade_frontalface_default.xml')

face_cascade = cv2.CascadeClassifier(frontal_cascade_path)
# CascadeClassifier does not raise on a missing or unreadable file; it just
# stays empty and fails later inside detectMultiScale. Fail here instead.
if face_cascade.empty():
    raise IOError(f"Could not load Haar cascade from {frontal_cascade_path}")

In [None]:
def ROI(img, size=256):
    '''
    Crop a face region from an image
    param: img - image array passed to the frontal-face cascade
    param: size - side of the crop taken from the top-left corner of the detection (default 256)
    return: a copy of img[y:y+size, x:x+size] for the last detected face,
            or [] when no face is detected
    The crop can be smaller than size x size when the detection lies close to
    the image border; the detected width/height are not used for the crop.
    '''
    face_rects = face_cascade.detectMultiScale(img, scaleFactor=1.3, minNeighbors=5)

    # Explicit "no face" branch; the previous version relied on an unbound
    # local plus a bare except to produce the same [] result.
    if len(face_rects) == 0:
        return []

    # As before, when several faces are found the last detection wins.
    x, y, w, h = face_rects[-1]
    # Copy only the crop so the full frame is neither duplicated nor kept alive.
    return img[y:y+size, x:x+size].copy()

## Model building

In [None]:
# Accumulates one face crop (or [] when none was found) per processed video.
data = []

In [None]:
# Extract one face crop per video for the first 2000 metadata entries.
for n,v in enumerate(list(meta_train_df.index)[:2000]):
    print(n,v)

    cap = cv2.VideoCapture(os.path.join(DATA_FOLDER, TRAIN_SAMPLE_FOLDER,v))
    try:
        ret, frame = cap.read()
        roi = ROI(frame) if ret else []

        # No face in the first frame: retry at 1s, 2s, ... 9s into the video.
        count = 1
        while len(roi) < 1 and count < 10:
            cap.set(cv2.CAP_PROP_POS_MSEC,(count*1000))
            ret,frame = cap.read()
            if ret: roi = ROI(frame)
            count+=1
    finally:
        # Release the decoder; otherwise 2000 captures stay open until garbage collection.
        cap.release()

    # roi is still [] here when no face was found in any of the tried frames.
    data.append(roi)

In [None]:
# Bring every entry of data to shape (256,256,3).
for i in range(len(data)):
    # len(...) == 0 works for both the [] placeholder and arrays; comparing an
    # ndarray with [] is an element-wise operation and is not a reliable test.
    if len(data[i]) == 0:
        data[i] = np.zeros((256,256,3))
    print(data[i].shape)
    if data[i].shape != (256,256,3):
        # cv2.resize rescales the picture; np.resize only repeats/truncates the
        # flattened pixel buffer, which scrambles the image content.
        data[i] = cv2.resize(data[i], (256,256))

In [None]:
# Labels of all metadata rows, in metadata order.
label_list = list(meta_train_df.label)

In [None]:
# Keep only the frames from position 1000 onward.
# NOTE(review): label_list is not sliced here, and re-running this cell slices
# again (not idempotent) — confirm this step is intended.
data = data[1000:]

In [None]:
# Number of frames currently held in data.
len(data)

In [None]:
# Split the extracted frames 80/20 into train and test, keeping each frame
# paired with the label of the video it came from.
N_EXTRACTED = 2000      # the extraction loop processed list(meta_train_df.index)[:2000]
TRAIN_FRACTION = 0.8    # same ratio as the former fixed 1600 / 400 split

# data may have lost leading frames (the data[1000:] cell), so derive which
# slice of label_list matches the frames that remain instead of assuming that
# position 0 of data is video 0.
label_offset = min(N_EXTRACTED, len(label_list)) - len(data)
assert label_offset >= 0, "data holds more frames than were extracted; re-run the extraction from a fresh data = []"
aligned_labels = label_list[label_offset:label_offset + len(data)]
n_train = int(len(data) * TRAIN_FRACTION)

train_images, train_labels = np.asarray(data[:n_train]), np.asarray(aligned_labels[:n_train])
test_images, test_labels = np.asarray(data[n_train:]), np.asarray(aligned_labels[n_train:])

In [None]:
# Sanity check: shapes of the training images/labels and of the test images.
print(train_images.shape,train_labels.shape, test_images.shape)

In [None]:
# Divide pixel values by 255.0 (each array is replaced by a new float array).
# Re-running this cell divides again, so run it once per split.
train_images = train_images / 255.0
test_images = test_images / 255.0

In [None]:
# Visual check of the first training image after scaling.
plt.imshow(train_images[0])

### Model Training

In [None]:
import tensorflow as tf

from tensorflow.keras import datasets, layers, models, metrics
import matplotlib.pyplot as plt

In [None]:
# Constant initializer for the bias of model2's sigmoid output layer.
# log(0.2) makes the initial sigmoid output 0.2 / 1.2 ≈ 0.17 before any training.
# NOTE(review): 0.2 presumably reflects the expected class ratio — confirm.
output_bias = tf.keras.initializers.Constant(np.log([0.2]))

In [None]:
# model2: four Conv2D -> BatchNormalization -> MaxPooling2D stages followed by
# a dropout-regularised dense head with one sigmoid output unit.
model2 = models.Sequential([
    layers.Conv2D(8, (3, 3), activation='relu', input_shape=(256,256,3)),
    layers.BatchNormalization(),
    layers.MaxPooling2D((2, 2)),

    layers.Conv2D(8, (5, 5), activation='relu'),
    layers.BatchNormalization(),
    layers.MaxPooling2D((2, 2)),

    layers.Conv2D(16, (5, 5), activation='relu'),
    layers.BatchNormalization(),
    layers.MaxPooling2D((2, 2)),

    layers.Conv2D(16, (5, 5), activation='relu'),
    layers.BatchNormalization(),
    layers.MaxPooling2D((4, 4)),

    layers.Flatten(),
    layers.Dropout(0.5),
    layers.Dense(16, activation='relu'),
    layers.Dropout(0.5),
    # The output bias starts from the constant defined in output_bias.
    layers.Dense(1, activation='sigmoid',bias_initializer=output_bias),
])

model2.summary()

In [None]:
def encode_labels(labels):
    '''
    Map the string labels to the numeric targets used for training
    param: labels - array-like of 'FAKE' / 'REAL' strings, or already encoded numbers
    return: float64 numpy array with FAKE -> 0.0 and REAL -> 1.0
    raises: KeyError when a string label is neither 'FAKE' nor 'REAL'
    '''
    labels = np.asarray(labels)
    # Already numeric (e.g. the cell was re-run): nothing left to map.
    if labels.dtype.kind in 'fiub':
        return labels.astype('float64')
    label_to_id = {'FAKE': 0.0, 'REAL': 1.0}
    return np.array([label_to_id[label] for label in labels], dtype='float64')

train_labels = encode_labels(train_labels)
# The validation labels are passed to fit() as well, so they must be numeric
# too; they were previously left as strings at this point.
test_labels = encode_labels(test_labels)

In [None]:
# model2 ends in a single sigmoid unit, i.e. it outputs a probability for a
# binary target. The matching loss is binary cross-entropy on probabilities
# (from_logits=False); sparse categorical cross-entropy expects one output per
# class and does not fit a one-unit head.
model2.compile(optimizer='adam',
              loss=tf.keras.losses.BinaryCrossentropy(from_logits=False),
              metrics=['accuracy',])

In [None]:
# Train model2 for 10 epochs; test_images / test_labels as defined at this
# point of the notebook serve as validation data.
history = model2.fit(train_images, train_labels, epochs=10,
                    validation_data=(test_images, test_labels))

In [None]:
# Training vs. validation accuracy per epoch; each curve is labelled with its
# history key.
for curve_key in ('accuracy', 'val_accuracy'):
    plt.plot(history.history[curve_key], label=curve_key)
plt.xlabel('Epoch')
plt.ylabel('Accuracy')
# Fixed y-range: values outside 0.8-0.9 are not visible.
plt.ylim([0.8, 0.9])
plt.legend(loc='lower right')

In [None]:
# Accumulates the face crops of the separately extracted evaluation videos.
data2 = []

In [None]:
# Extract one face crop per video for metadata entries 1000..1199.
# NOTE(review): these entries lie inside the first 2000 videos that were used
# to build `data`, so this evaluation set may overlap the training frames —
# confirm, and prefer held-out indices if so.
for n,v in enumerate(list(meta_train_df.index)[1000:1200]):
    print(n,v)

    cap = cv2.VideoCapture(os.path.join(DATA_FOLDER, TRAIN_SAMPLE_FOLDER,v))
    try:
        ret, frame = cap.read()
        roi = ROI(frame) if ret else []

        # No face in the first frame: retry at 1s, 2s, ... 9s into the video.
        count = 1
        while len(roi) < 1 and count < 10:
            cap.set(cv2.CAP_PROP_POS_MSEC,(count*1000))
            ret,frame = cap.read()
            if ret: roi = ROI(frame)
            count+=1
    finally:
        # Release the decoder instead of leaving it to garbage collection.
        cap.release()

    # roi is still [] here when no face was found in any of the tried frames.
    data2.append(roi)

In [None]:
# Bring every entry of data2 to shape (256,256,3).
for i in range(len(data2)):
    # len(...) == 0 works for both the [] placeholder and arrays; comparing an
    # ndarray with [] is an element-wise operation and is not a reliable test.
    if len(data2[i]) == 0:
        data2[i] = np.zeros((256,256,3))
    print(data2[i].shape)
    if data2[i].shape != (256,256,3):
        # cv2.resize rescales the picture; np.resize only repeats/truncates the
        # flattened pixel buffer, which scrambles the image content.
        data2[i] = cv2.resize(data2[i], (256,256))

In [None]:
# Replace the earlier test split: images from data2, labels from the same
# metadata positions (1000..1199) the data2 videos were read from.
test_images, test_labels = np.asarray(data2), np.asarray(label_list[1000:1200])

In [None]:
# Divide pixel values by 255.0; re-running this cell divides again.
test_images = test_images / 255.0

In [None]:
# Encode the evaluation labels: FAKE -> 0, REAL -> 1, then cast to float64.
fake_mask = test_labels == 'FAKE'
real_mask = test_labels == 'REAL'
test_labels[fake_mask] = 0
test_labels[real_mask] = 1

test_labels = test_labels.astype('float64')

In [None]:
# Evaluate model2 on the evaluation set. Unpacking into two names matches the
# single 'accuracy' metric model2 is first compiled with; once model2 has been
# recompiled with the longer METRICS list, evaluate() returns more values.
test_loss, test_acc = model2.evaluate(test_images,  test_labels, verbose=2)

In [None]:
# model: a deeper CNN without batch normalisation or dropout. Filters grow
# 32 -> 64 -> 128 -> 256, each stage ends with 2x2 max pooling, and the head is
# Dense(64) followed by a single sigmoid unit.
model = models.Sequential([
    layers.Conv2D(32, (3, 3), activation='relu', input_shape=(256,256,3)),
    layers.MaxPooling2D((2, 2)),

    layers.Conv2D(64, (3, 3), activation='relu'),
    layers.MaxPooling2D((2, 2)),

    layers.Conv2D(128, (3, 3), activation='relu'),
    layers.Conv2D(128, (3, 3), activation='relu'),
    layers.MaxPooling2D((2, 2)),

    layers.Conv2D(256, (3, 3), activation='relu'),
    layers.MaxPooling2D((2, 2)),

    layers.Conv2D(256, (3, 3), activation='relu'),
    layers.Conv2D(256, (3, 3), activation='relu'),
    layers.MaxPooling2D((2, 2)),

    layers.Flatten(),
    layers.Dense(64, activation='relu'),
    layers.Dense(1, activation='sigmoid'),
])

model.summary()

In [None]:
# Class-weight experiment 1:15 (class 1 is REAL in the label encoding).
class_weight = {0: 1.,
                1: 15.}
# Confusion-matrix counts plus threshold and ranking metrics for binary output.
METRICS = [
      metrics.TruePositives(name='tp'),
      metrics.FalsePositives(name='fp'),
      metrics.TrueNegatives(name='tn'),
      metrics.FalseNegatives(name='fn'), 
      metrics.BinaryAccuracy(name='accuracy'),
      metrics.Precision(name='precision'),
      metrics.Recall(name='recall'),
      metrics.AUC(name='auc'),
]
# model2's last layer already applies a sigmoid, so the loss receives
# probabilities: from_logits must be False. With from_logits=True the sigmoid
# is effectively applied twice and the loss value is wrong.
model2.compile(optimizer='adam',
              loss=tf.keras.losses.BinaryCrossentropy(from_logits=False),
              metrics=METRICS)
# model2 is not re-created before this call, so training continues from the
# weights left by the earlier fit.
history3 = model2.fit(train_images, train_labels, epochs=10,
                    validation_data=(test_images, test_labels),class_weight=class_weight)

In [None]:
# The deeper model also ends in a sigmoid unit, so the loss receives
# probabilities: from_logits must be False (True would apply the sigmoid twice).
model.compile(optimizer='adam',
              loss=tf.keras.losses.BinaryCrossentropy(from_logits=False),
              metrics=METRICS)
# Short 3-epoch run without class weights.
history5 = model.fit(train_images, train_labels, epochs=3,
                    validation_data=(test_images, test_labels))

In [None]:
# Class-weight experiment 1:1 (both classes weighted equally).
# model2 is not re-created here, so this run continues from the weights left
# by the previous fit on model2.
class_weight = {0: 1.,
                1: 1.}
history6 = model2.fit(train_images, train_labels, epochs=10,
                    validation_data=(test_images, test_labels),class_weight=class_weight)

In [None]:
# Class-weight experiment 1:5 (class 1 weighted five times class 0).
# model2 is not re-created here, so this run continues from the weights left
# by the previous fit on model2.
class_weight = {0: 1.,
                1: 5.}
history7 = model2.fit(train_images, train_labels, epochs=10,
                    validation_data=(test_images, test_labels),class_weight=class_weight)

In [None]:
# Class-weight experiment 1:10 (class 1 weighted ten times class 0).
# model2 is not re-created here, so this run continues from the weights left
# by the previous fit on model2.
class_weight = {0: 1.,
                1: 10.}
history8 = model2.fit(train_images, train_labels, epochs=10,
                    validation_data=(test_images, test_labels),class_weight=class_weight)

In [None]:
# Reset the global state kept by Keras.
# NOTE(review): the model2 object is still trained in the next cell — confirm
# why the session is cleared at this point.
tf.keras.backend.clear_session()

In [None]:
# Class-weight experiment 1:25 (class 1 weighted twenty-five times class 0).
# model2 is not re-created here, so this run continues from the weights left
# by the previous fit on model2.
class_weight = {0: 1.,
                1: 25.}
history9 = model2.fit(train_images, train_labels, epochs=10,
                    validation_data=(test_images, test_labels),class_weight=class_weight)

In [None]:
# Per-epoch metric tables, ordered by class-weight ratio:
# history6 (1:1), history7 (1:5), history8 (1:10), history3 (1:15).
model_history = [
    pd.DataFrame.from_dict(run.history)
    for run in (history6, history7, history8, history3)
]

In [None]:
# Loss, AUC and accuracy per epoch (training and validation) of the first stored run.
model_history[0][['loss','auc','accuracy','val_loss','val_auc','val_accuracy']]

In [None]:
# Validation confusion-matrix counts per epoch of the first stored run.
model_history[0][['val_tp','val_fp','val_tn','val_fn']]

In [None]:
# Mean over epochs of the training AUC of the last stored run.
model_history[-1].mean(axis=0)['auc']

In [None]:
def ave_auc (history_list):
    '''
    Average the 'auc' column of each history table
    param: history_list - iterable of dataframes that contain an 'auc' column
    return: list with one column-mean 'auc' value per dataframe, in input order
    '''
    return [run_frame.mean(axis=0)['auc'] for run_frame in history_list]

In [None]:
# Add the 1:25 run as the fifth entry. Re-running this cell appends it again.
model_history.append(pd.DataFrame.from_dict(history9.history))

In [None]:
# Mean training AUC per class-weight ratio; the labels follow the order in
# which the runs were stored in model_history.
ratio_labels = ['1:1','1:5','1:10','1:15','1:25']
bar_positions = [1,2,3,4,5]
plt.bar(bar_positions, ave_auc(model_history), tick_label=ratio_labels)
# Fixed y-range: bars outside 0.4-0.54 are clipped.
plt.ylim([0.4,0.54])
plt.ylabel('AUC')
plt.xlabel('class weight ratio')