# Importing Libraries and bringing data in

In [1]:
import os
import pandas as pd
import cv2 as cv
import numpy as np
import tensorflow as tf
from tqdm import tqdm
from mtcnn import MTCNN
from sklearn.model_selection import train_test_split
from tensorflow import keras

base_dir = "mini-face-forensics/"

  _warn(("h5py is running against HDF5 {0} when it was built against {1}, "


In [2]:
labels = os.listdir(base_dir)
print (labels)

['fake', 'real']


In [3]:
detector = MTCNN()

In [4]:
video_paths = []
labels = []  
for sub_folder in os.listdir(base_dir):
    label = sub_folder
    sub_folder = os.path.join(base_dir, sub_folder)
    for vid in os.listdir(sub_folder):
        video_paths.append(os.path.join(sub_folder, vid))
        labels.append(label)

In [5]:
print("First 10 video paths:", video_paths[:10])
print("Last 10 video paths:", video_paths[-10:])
print("First 10 labels:", labels[:10])
print("Last 10 labels:", labels[-10:])

First 10 video paths: ['mini-face-forensics/fake\\01_02__outside_talking_still_laughing__YVGY8LOK.mp4', 'mini-face-forensics/fake\\01_02__walk_down_hall_angry__YVGY8LOK.mp4', 'mini-face-forensics/fake\\01_03__hugging_happy__ISF9SP4G.mp4', 'mini-face-forensics/fake\\01_03__podium_speech_happy__480LQD1C.mp4', 'mini-face-forensics/fake\\01_03__talking_against_wall__JZUXXFRB.mp4', 'mini-face-forensics/fake\\01_11__meeting_serious__9OM3VE0Y.mp4', 'mini-face-forensics/fake\\01_11__secret_conversation__4OJNJLOO.mp4', 'mini-face-forensics/fake\\01_11__talking_against_wall__9229VVZ3.mp4', 'mini-face-forensics/fake\\01_11__walking_outside_cafe_disgusted__FAFWDR4W.mp4', 'mini-face-forensics/fake\\01_12__outside_talking_pan_laughing__TNI7KUZ6.mp4']
Last 10 video paths: ['mini-face-forensics/real\\15__outside_talking_still_laughing.mp4', 'mini-face-forensics/real\\15__podium_speech_happy.mp4', 'mini-face-forensics/real\\15__talking_against_wall.mp4', 'mini-face-forensics/real\\15__talking_angry_cou

In [6]:
df = pd.DataFrame({'video_path': video_paths, 'label': labels})
print(df.head())
print(df.tail())

                                          video_path label
0  mini-face-forensics/fake\01_02__outside_talkin...  fake
1  mini-face-forensics/fake\01_02__walk_down_hall...  fake
2  mini-face-forensics/fake\01_03__hugging_happy_...  fake
3  mini-face-forensics/fake\01_03__podium_speech_...  fake
4  mini-face-forensics/fake\01_03__talking_agains...  fake
                                            video_path label
395  mini-face-forensics/real\15__walking_down_indo...  real
396  mini-face-forensics/real\15__walking_down_stre...  real
397  mini-face-forensics/real\15__walking_outside_c...  real
398  mini-face-forensics/real\15__walk_down_hall_an...  real
399   mini-face-forensics/real\16__exit_phone_room.mp4  real


In [7]:
train_df, test_df = train_test_split(df, test_size=0.2, random_state=42, shuffle=True)

In [8]:
print(f"Total videos for training: {len(train_df)}")
print(f"Total videos for testing: {len(test_df)}")

Total videos for training: 320
Total videos for testing: 80


In [9]:
def get_center(frame):
    y, x = frame.shape[0:2]
    min_dim = min(y, x)
    start_x = (x//2)-(min_dim//2)
    start_y = (y//2)-(min_dim//2)
    return frame[start_y:start_y+min_dim, start_x:start_x+min_dim]

In [10]:
# return frame and face bounding box for first frame, and then reusing same box to save computation of detector
def get_face_region_for_first_frame(frame, previous_box=None):
    if previous_box is None:
        # Detect the face only if no previous bounding box is provided
        detections = detector.detect_faces(frame)
        if detections:
            x, y, w, h = detections[0]['box']
            previous_box = (x, y, w, h)
        else:
            return get_center(frame), None  # fallback to center crop if no face detected
    else:
        x, y, w, h = previous_box

    face_region = frame[y:y+h, x:x+w]
    return face_region, previous_box

In [11]:
IMG_SIZE = 224
def load_video(path, max_frames=0, resize=(IMG_SIZE, IMG_SIZE), skip_frames=2):
    cap = cv.VideoCapture(path)
    frames = []
    frame_count = 0
    previous_box = None  # Store the bounding box from first frame
    try:
        while True:
            ret, frame = cap.read()
            if not ret:
                break
            # Process frames as per skip rate to avoid getting near same frames
            if frame_count % skip_frames == 0:
                frame, previous_box = get_face_region_for_first_frame(frame, previous_box)
                frame = cv.resize(frame, resize)
                frame = frame[:, :, [2, 1, 0]]  # BGR to RGB
                frames.append(frame)
                if len(frames) == max_frames:
                    break
            frame_count += 1
        # Pad with the last frame if we have fewer frames than max_frames
        while len(frames) < max_frames and frames:
            frames.append(frames[-1])
    finally:
        cap.release()
    return np.array(frames)

# Building Feature extractor

In [12]:
def build_feature_extractor():
    feature_extractor = keras.applications.InceptionV3(
        weights="imagenet",
        include_top=False,
        pooling="avg",
        input_shape=(IMG_SIZE, IMG_SIZE, 3),)
    preprocess_input = keras.applications.inception_v3.preprocess_input
    inputs = keras.Input((IMG_SIZE, IMG_SIZE, 3))
    preprocessed = preprocess_input(inputs)
    outputs = feature_extractor(preprocessed)
    return keras.Model(inputs, outputs, name="feature_extractor")

feature_extractor = build_feature_extractor()

In [13]:
labeler = keras.layers.StringLookup(num_oov_indices=0, vocabulary=np.unique(train_df["label"]))
print(labeler.get_vocabulary())

['fake', 'real']


# Extracting frames

In [14]:
#hyperparameters
IMG_SIZE = 224
BATCH_SIZE = 64
MAX_SEQ_LENGTH = 20
NUM_FEATURES = 2048

In [15]:
def prepare_all_videos(df):
    num_samples = len(df)
    video_paths = df["video_path"].values.tolist()
    labels = df["label"].values
    #convert classlabels to integer format(one-hot)
    labels = labeler(labels[..., None]).numpy()
    # `frame_masks` and `frame_features` are what we will feed to our sequence model.
    # `frame_masks` will contain a bunch of booleans denoting if a timestep is masked with padding or not.
    frame_masks = np.zeros(shape=(num_samples, MAX_SEQ_LENGTH), dtype="bool") # 320,20
    frame_features = np.zeros(shape=(num_samples, MAX_SEQ_LENGTH, NUM_FEATURES), dtype="float32") #320,20,2048
    # For each video.
    for idx, path in enumerate(tqdm(video_paths, desc="Extracting features")):
        # Gather all its frames and add a batch dimension.
        frames = load_video(path)
        frames = frames[None, ...]
        # Initialize placeholders to store the masks and features of the current video.
        temp_frame_mask = np.zeros(shape=(1, MAX_SEQ_LENGTH,), dtype="bool")
        temp_frame_features = np.zeros(shape=(1, MAX_SEQ_LENGTH, NUM_FEATURES), dtype="float32")
        # Extract features from the frames of the current video.
        for i, batch in enumerate(frames):
            video_length = batch.shape[0]
            length = min(MAX_SEQ_LENGTH, video_length)
            for j in range(length):
                temp_frame_features[i, j, :] = feature_extractor.predict(batch[None, j, :], verbose=0)
            temp_frame_mask[i, :length] = 1  # 1=not masked, 0=masked

        frame_features[idx,] = temp_frame_features.squeeze()
        frame_masks[idx,] = temp_frame_mask.squeeze()

    return (frame_features, frame_masks), labels

train_data, train_labels = prepare_all_videos(train_df)
test_data, test_labels = prepare_all_videos(test_df)
print(f"Frame features in train set: {train_data[0].shape}")
print(f"Frame masks in train set: {train_data[1].shape}")
print(f"train_labels in train set: {train_labels.shape}")
print(f"test_labels in train set: {test_labels.shape}")

Extracting features: 100%|██████████| 320/320 [47:50<00:00,  8.97s/it]  
Extracting features: 100%|██████████| 80/80 [10:31<00:00,  7.89s/it]

Frame features in train set: (320, 20, 2048)
Frame masks in train set: (320, 20)
train_labels in train set: (320, 1)
test_labels in train set: (80, 1)





# Creating LSTM

In [19]:
def build_LSTM_model():
    class_vocab = labeler.get_vocabulary()
    frame_features_input = keras.Input((MAX_SEQ_LENGTH, NUM_FEATURES))
    mask_input = keras.Input((MAX_SEQ_LENGTH,), dtype="bool")
    # to learn about masking, see keras api documentation
    x = keras.layers.LSTM(16, return_sequences=True)(frame_features_input, mask=mask_input)
    x = keras.layers.LSTM(8)(x)
    x = keras.layers.Dropout(0.4)(x)
    x = keras.layers.Dense(8, activation="relu")(x)
    output = keras.layers.Dense(len(class_vocab), activation="softmax")(x)
    lstm_model = keras.Model([frame_features_input, mask_input], output)
    lstm_model.compile(loss="sparse_categorical_crossentropy", optimizer="adam", metrics=["accuracy"])
    return lstm_model

def run_experiment():
    filepath = "deepfake_detector_model.keras"
    checkpoint = keras.callbacks.ModelCheckpoint(filepath, save_best_only=True, verbose=1)
    lstm_model = build_LSTM_model()
    history = lstm_model.fit([train_data[0], train_data[1]],train_labels,validation_split=0.3,epochs=30,callbacks=[checkpoint],)

    lstm_model=keras.models.load_model(filepath)
    _, accuracy = lstm_model.evaluate([test_data[0], test_data[1]], test_labels)
    print(f"Test accuracy: {round(accuracy * 100, 2)}%")
    return history, lstm_model

_, lstm_model = run_experiment()

Epoch 1/30




[1m5/7[0m [32m━━━━━━━━━━━━━━[0m[37m━━━━━━[0m [1m0s[0m 13ms/step - accuracy: 0.4301 - loss: 0.7251
Epoch 1: val_loss improved from inf to 0.70529, saving model to deepfake_detector_model.keras
[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 77ms/step - accuracy: 0.4561 - loss: 0.7148 - val_accuracy: 0.4792 - val_loss: 0.7053
Epoch 2/30
[1m6/7[0m [32m━━━━━━━━━━━━━━━━━[0m[37m━━━[0m [1m0s[0m 11ms/step - accuracy: 0.4960 - loss: 0.7356
Epoch 2: val_loss improved from 0.70529 to 0.68493, saving model to deepfake_detector_model.keras
[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 20ms/step - accuracy: 0.4981 - loss: 0.7290 - val_accuracy: 0.5000 - val_loss: 0.6849
Epoch 3/30
[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 11ms/step - accuracy: 0.5584 - loss: 0.6733
Epoch 3: val_loss improved from 0.68493 to 0.67015, saving model to deepfake_detector_model.keras
[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 22ms/step - 



[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 8ms/step - accuracy: 0.8211 - loss: 0.4340  
Test accuracy: 83.75%


In [21]:
def prepare_single_video(frames):
    frames = frames[None, ...]
    frame_mask = np.zeros(shape=(1, MAX_SEQ_LENGTH,), dtype="bool")
    frame_features = np.zeros(shape=(1, MAX_SEQ_LENGTH, NUM_FEATURES), dtype="float32")
    for i, batch in enumerate(frames):
        video_length = batch.shape[0]
        length = min(MAX_SEQ_LENGTH, video_length)
        for j in range(length):
            frame_features[i, j, :] = feature_extractor.predict(batch[None, j, :],verbose=0)
        frame_mask[i, :length] = 1  # 1 = not masked, 0 = masked

    return frame_features, frame_mask


def sequence_prediction(path):
    class_vocab = labeler.get_vocabulary()
    frames = load_video(path)
    frame_features, frame_mask = prepare_single_video(frames)
    probabilities = lstm_model.predict([frame_features, frame_mask])[0]
    for i in np.argsort(probabilities)[::-1]:
        print(f"  {class_vocab[i]}: {probabilities[i] * 100:5.2f}%")
    return frames

test_video = np.random.choice(test_df["video_path"].values.tolist())
print(f"Test video path: {test_video}")
test_frames = sequence_prediction(test_video)


Test video path: mini-face-forensics/real\04__talking_against_wall.mp4
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 32ms/step
  real: 86.61%
  fake: 13.39%


In [22]:
video="pk_screening_aamir_khan_deepfake.mp4"
sampled_frames=sequence_prediction(video)

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 15ms/step
  fake: 85.68%
  real: 14.32%
