# Dataset construction

This notebook constructs a reproducible dataset used to train models on two main tasks:
- **Real/Fake Image Classification**
- **Real/Fake Video Classification**


In [None]:
# import kagglehub

# # Download latest version
# path = kagglehub.dataset_download("adham7elmy/faceforencispp-extracted-frames")

# print("Path to dataset files:", path)

  from .autonotebook import tqdm as notebook_tqdm


Downloading from https://www.kaggle.com/api/v1/datasets/download/adham7elmy/faceforencispp-extracted-frames?dataset_version_number=4...


100%|██████████| 13.4G/13.4G [10:52<00:00, 22.1MB/s]  

Extracting files...





Path to dataset files: /Users/mac/.cache/kagglehub/datasets/adham7elmy/faceforencispp-extracted-frames/versions/4


If you have already downloaded the dataset, set the `path` variable below to the directory that contains it.

In [1]:
# Root directory of the dataset. The cells below read real frames from
# "<path>/real/<video>/" and fake frames from "<path>/fake/<method>/<video>/".
path = "path_to_your_dataset"

In [3]:
import pandas as pd
from PIL import Image
import os
import shutil
import random

## 1. Construct dataset for Real/Fake Image Classification

In [4]:
real_path = os.path.join(path, "real")
# Each sub-directory of `real_path` is treated as one video. Plain files
# (e.g. a stray ".DS_Store") are skipped, because every ID is later opened
# with os.listdir() and a non-directory entry would raise there.
# Sorting makes the ID order, and therefore the split below, deterministic.
video_ids = sorted(
    entry
    for entry in os.listdir(real_path)
    if os.path.isdir(os.path.join(real_path, entry))
)

In [5]:
# Split at the video level: each video ID belongs to exactly one of
# train / val / test, in the (sorted) order of `video_ids`.
total = len(video_ids)
train_size = int(0.7 * total)  # 70% of the videos
val_size = int(0.15 * total)  # 15% of the videos
val_end = train_size + val_size

train_vids = video_ids[:train_size]
val_vids = video_ids[train_size:val_end]
test_vids = video_ids[val_end:]  # everything that is left, including rounding remainders

print("Total: ", total)
print(f"Number of train vids: {len(train_vids)}")
print(f"Number of val vids: {len(val_vids)}")
print(f"Number of test vids: {len(test_vids)}")

Total:  999
Number of train vids: 699
Number of val vids: 149
Number of test vids: 151


In [6]:
def copy_n_frames_from_vids(
    vid_list, src_root, dst_dir, frames_per_vid=15, label="real"
):
    """Copy a random sample of frames from each video into one flat directory.

    A generator seeded with 42 is created on every call, so calling the
    function again with the same arguments and the same directory contents
    selects the same frames.

    Args:
        vid_list: Names of the video sub-directories located under ``src_root``.
        src_root: Directory that contains one sub-directory per video.
        dst_dir: Output directory; created if it does not exist.
        frames_per_vid: Maximum number of frames copied per video. A video
            with fewer entries contributes all of them; a value <= 0 copies
            nothing.
        label: Prefix of every copied file name and the word used in the
            printed summary.

    Each copied file is named ``{label}_{vid}_{frame}.png``, where ``frame``
    is the part of the source file name before its first ``.``. The ``.png``
    suffix is written regardless of the source file's own extension.
    """
    rng = random.Random(42)
    os.makedirs(dst_dir, exist_ok=True)

    total_frames_copied = 0
    for vid in vid_list:
        vid_dir = os.path.join(src_root, vid)
        # Sort before shuffling so the sample does not depend on the
        # arbitrary order in which os.listdir() returns entries.
        frames = sorted(os.listdir(vid_dir))
        rng.shuffle(frames)

        # Clamp at 0: a negative slice bound would otherwise select
        # "all but the last k" entries instead of none.
        for f in frames[: max(frames_per_vid, 0)]:
            frame_num = f.split(".")[0]
            shutil.copy(
                os.path.join(vid_dir, f),
                # Use `label` (not a hard-coded "real") so the file name
                # agrees with the label reported in the summary.
                os.path.join(dst_dir, f"{label}_{vid}_{frame_num}.png"),
            )
            total_frames_copied += 1
    print(f"Copied {total_frames_copied} {label} images")


# Real images: 15 frames per training video, 10 per validation/test video
copy_n_frames_from_vids(
    vid_list=train_vids, src_root=real_path, dst_dir="data/train/real", frames_per_vid=15
)
copy_n_frames_from_vids(
    vid_list=val_vids, src_root=real_path, dst_dir="data/val/real", frames_per_vid=10
)
copy_n_frames_from_vids(
    vid_list=test_vids, src_root=real_path, dst_dir="data/test/real", frames_per_vid=10
)

# Fake images — we sample across all manipulation methods for better generalization
fake_methods = [
    "Deepfakes",
    "Face2Face",
    "FaceSwap",
    "FaceShifter",
    "NeuralTextures",
]


def copy_fake_frames(
    vid_list, frames_per_vid, split, src_root=None, methods=None, dst_root="data"
):
    """Copy a random sample of fake frames for the videos of one split.

    For every manipulation method, each video directory under
    ``<src_root>/fake/<method>/`` whose name starts with an ID contained in
    ``vid_list`` contributes up to ``frames_per_vid`` randomly chosen
    ``png`` frames. A single generator seeded with 42 is created per call, so
    the selection is reproducible for identical inputs and directory contents.

    Args:
        vid_list: Video IDs that belong to this split.
        frames_per_vid: Maximum number of frames copied per (method, video)
            directory; a value <= 0 copies nothing.
        split: Split name; frames are written to ``<dst_root>/<split>/fake``
            and the name is used in the printed summary.
        src_root: Dataset root directory. Defaults to the notebook-level
            ``path`` variable.
        methods: Manipulation method directory names to visit. Defaults to
            the notebook-level ``fake_methods`` list. Methods whose directory
            does not exist are skipped.
        dst_root: Root of the output tree. Defaults to ``"data"``.
    """
    if src_root is None:
        src_root = path
    if methods is None:
        methods = fake_methods

    # Set membership is O(1); the check below runs once per video directory.
    split_ids = set(vid_list)
    rng = random.Random(42)
    dst = os.path.join(dst_root, split, "fake")
    os.makedirs(dst, exist_ok=True)

    copied = 0
    for method in methods:
        method_dir = os.path.join(src_root, "fake", method)
        if not os.path.exists(method_dir):
            continue
        for vid in sorted(os.listdir(method_dir)):
            vid_dir = os.path.join(method_dir, vid)
            if not os.path.isdir(vid_dir):
                continue
            # Only the first three characters of the directory name are
            # compared with the split; the rest of the name is ignored.
            vid_id = vid[:3]
            if vid_id not in split_ids:
                # Only use fakes where target video is in the correct split
                continue

            target_frames = [f for f in sorted(os.listdir(vid_dir)) if f.endswith("png")]
            rng.shuffle(target_frames)
            # Clamp at 0 so a negative count selects nothing rather than
            # "all but the last k" frames.
            for f in target_frames[: max(frames_per_vid, 0)]:
                frame_num = f.split(".")[0]
                # Name convention: fake_<vid_id>_<method>_<frame>.png
                # NOTE(review): the name uses `vid_id`, not the full directory
                # name, so two directories of one method sharing the same
                # 3-character prefix would overwrite each other's frames.
                dst_name = f"fake_{vid_id}_{method}_{frame_num}.png"
                shutil.copy(os.path.join(vid_dir, f), os.path.join(dst, dst_name))
                copied += 1
    print(f"{split}/fake: {copied}")


# Fake images: 3 frames per (method, video) for train, 2 for val/test
copy_fake_frames(vid_list=train_vids, frames_per_vid=3, split="train")
copy_fake_frames(vid_list=val_vids, frames_per_vid=2, split="val")
copy_fake_frames(vid_list=test_vids, frames_per_vid=2, split="test")

Copied 10485 real images
Copied 1490 real images
Copied 1510 real images
train/fake: 10484
val/fake: 1490
test/fake: 1510
