# Final Project 

**Title**: Deep Fake Detection

**Contributors**: Adam Haile, Alhagie Boye, Rudolph Evonich

**Objective**: To develop a robust CNN model capable of accurately classifying video frames as real or fake.


This is a demo of our CNN model that will be used to classify videos as real or fake.


In [None]:
import os
import json
import glob
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import tensorflow
from tqdm import tqdm
import concurrent.futures
import tensorflow.keras as keras
import frame_extractor as extractor
from tensorflow.keras.layers import Dense, Dropout, Flatten, Conv2D, MaxPooling2D
from sklearn.model_selection import train_test_split
from sklearn.utils import shuffle

# Data Loading & Data Preprocessing
The following cells handle the loading of the video data and its associated metadata, as well as the data preprocessing.

Video files and their corresponding metadata are retrieved from a designated directory (train_sample_videos/) and a JSON file (metadata.json).

A custom module named frame_extractor is utilized to extract frames from each video for further processing.

The orientation of the frames is checked, and those with vertical orientation are rotated to ensure consistency.

Based on the metadata information, a one-hot label is assigned to each frame: [1, 0] for "FAKE" videos and [0, 1] for "REAL" videos.

In [None]:
def rotate_vertical(x):
    """Rotate a frame by 90 degrees when its shape is exactly (1920, 1080, 3).

    Frames of any other shape are returned untouched (the same object), so
    after this call a (1920, 1080, 3) frame has shape (1080, 1920, 3).
    """
    needs_rotation = x.shape == (1920, 1080, 3)
    return np.rot90(x) if needs_rotation else x

In [None]:
# Frames and their one-hot labels, filled in as each video finishes extraction.
train_X = []
train_Y = []
parent = os.path.dirname(os.getcwd())
videos = glob.glob(parent + "/train_sample_videos/*.mp4")
# Load the per-video metadata; the context manager closes the file handle.
with open(parent + "/train_sample_videos/metadata.json", encoding="utf-8") as f:
    valid = json.load(f)

videos_processed = 0

# ThreadPoolExecutor rejects max_workers=0, so keep at least one worker when no
# videos were found, and cap the pool instead of starting one thread per video.
worker_count = max(1, min(len(videos), 32))

with concurrent.futures.ThreadPoolExecutor(max_workers=worker_count) as executor:
    # Map each pending extraction back to the video path it was submitted for.
    futures = {executor.submit(extractor.run_extraction, directory): directory for directory in videos}

    # Consume results as they complete so the progress counter advances while
    # the remaining videos are still being extracted.
    for future in concurrent.futures.as_completed(futures):
        directory = futures[future]
        # basename copes with whichever path separator glob returned.
        video_name = os.path.basename(directory)
        try:
            # presumably an iterable of frame arrays -- confirm against frame_extractor
            result = future.result()
            # Resolve the label before appending any frame, so a missing
            # metadata entry cannot leave train_X longer than train_Y.
            label = [1, 0] if valid[video_name]["label"] == "FAKE" else [0, 1]
            for frame in result:
                train_X.append(rotate_vertical(frame))
                train_Y.append(np.array(label))
            videos_processed += 1
            print (f"Videos Finished: {videos_processed} / {len(videos)}", end='\r')
        except Exception as e:
            # Best effort: report the failing video and keep processing the rest.
            print(f"Error processing directory {directory}: {e}")

train_X = np.array(train_X)
train_Y = np.array(train_Y)

# Model Creation and Training
This cell creates the CNN model to be trained. (Currently a basic CNN example model, not our finished model.)

This serves as a basic example and comprises a convolutional layer, max-pooling, flattening, and two dense layers.

The dataset is split into training and validation sets to assess the model's performance during training.

The model is trained on the training set for a specified number of epochs and batch size.

The training and validation accuracy are visualized over epochs using matplotlib to track the model's learning progress.


In [None]:
# Training hyperparameters.
cnn_epochs = 10
cnn_batch_size = 5
train_samples = len(train_X)
input_shape =  (1080,1920,3) #TODO: Set input_shape to the shape of our input
num_classes = 2

# NOTE(review): this is a float computed from the full dataset (before the
# train/validation split) and is not passed to fit() below.
steps_per_epoch = train_samples/cnn_batch_size

# Split the data: 80% training, 20% validation, with a fixed seed.
# train_X / train_Y are rebound to the training portion here.
train_X, validation_X, train_Y, validation_Y = train_test_split(train_X, train_Y, test_size=0.2, random_state=42)

# Basic example network: one convolution, max-pooling, flatten, two dense layers.
cnn_model = keras.models.Sequential()
cnn_model.add(Conv2D(16, kernel_size=(3, 3), activation='relu', input_shape=input_shape))
cnn_model.add(MaxPooling2D(pool_size=(2, 2)))
cnn_model.add(Flatten())
cnn_model.add(Dense(16, activation='relu'))  # Reduce the number of neurons in this layer
cnn_model.add(Dense(num_classes, activation='softmax'))
# The labels are one-hot vectors and the output layer is a softmax over
# num_classes units, so categorical cross-entropy is the matching loss; it
# also stays correct if num_classes is ever raised above 2.
cnn_model.compile(loss="categorical_crossentropy",
              optimizer=keras.optimizers.SGD(),
              metrics=['accuracy'])
cnn_model.summary()

# Keep the returned History object; its per-epoch metrics are read from history.history.
history = cnn_model.fit(train_X, train_Y, epochs=cnn_epochs, batch_size=cnn_batch_size, validation_data=(validation_X, validation_Y))

# Draw the per-epoch training and validation accuracy curves on one figure.
for metric_key, curve_label in (
    ('accuracy', 'Training Accuracy'),
    ('val_accuracy', 'Validation Accuracy'),
):
    plt.plot(history.history[metric_key], label=curve_label)

# Figure title and axis labels.
plt.title('Training and Validation Accuracy')
plt.xlabel('Epoch')
plt.ylabel('Accuracy')

plt.legend()
plt.show()

# Evaluate the trained CNN on the held-out validation frames.
# evaluate() returns the loss followed by the metrics configured at compile
# time, so index 1 is the accuracy; it is printed as a percentage to give an
# indication of how well the model generalizes to real vs. fake frames.
evaluation = cnn_model.evaluate(x=validation_X, y=validation_Y)
print(f"Validation Accuracy: {evaluation[1] * 100:.2f}%")