# Dataset construction

This notebook constructs a reproducible dataset used to train models on two main tasks:
- **Real/Fake Image Classification**
- **Real/Fake Video Classification**


In [None]:
# Optional: uncomment this cell to download the dataset with kagglehub.
# The recorded run below fetched 13.4 GB and stored it in the local kagglehub cache;
# `path` then points at the downloaded dataset version directory.
# import kagglehub

# # Download latest version
# path = kagglehub.dataset_download("adham7elmy/faceforencispp-extracted-frames")

# print("Path to dataset files:", path)

  from .autonotebook import tqdm as notebook_tqdm


Downloading from https://www.kaggle.com/api/v1/datasets/download/adham7elmy/faceforencispp-extracted-frames?dataset_version_number=4...


100%|██████████| 13.4G/13.4G [10:52<00:00, 22.1MB/s]  

Extracting files...





Path to dataset files: /Users/mac/.cache/kagglehub/datasets/adham7elmy/faceforencispp-extracted-frames/versions/4


If you have already downloaded the dataset, set the `path` variable below to the directory that contains it (the folder holding the `real/` and `fake/` subdirectories).

In [None]:
# Root directory of the extracted dataset. The cells below read frames from
# `<path>/real/<video>` and `<path>/fake/<method>/<video>`.
# NOTE: running this cell overwrites any `path` value assigned earlier in the session.
path = "" # Your path here

In [1]:
import pandas as pd
from PIL import Image
import os
import shutil

In [2]:
import random
def set_seed(seed=42):
    """Seed Python's built-in ``random`` module.

    Only the standard-library generator is seeded here; it is the one used
    by the ``random.shuffle`` calls in the sampling helpers of this notebook.

    Args:
        seed: Value handed to ``random.seed``. Defaults to 42.
    """
    random.seed(a=seed)


# Fix the seed once, before any sampling takes place.
set_seed(seed=42)

## 1. Construct dataset for Real/Fake Image Classification

In [7]:
# Folder that holds one sub-directory of frames per real video.
real_path = os.path.join(path, "real")

# Keep directories only, so stray files (e.g. a macOS `.DS_Store`) are never
# treated as video ids and later passed to `os.listdir`. The ids are sorted so
# the train/val split does not depend on the filesystem's listing order.
video_ids = sorted(
    entry
    for entry in os.listdir(real_path)
    if os.path.isdir(os.path.join(real_path, entry))
)

In [10]:
# Deterministic split by position: `video_ids` is sorted, so the leading
# share of ids becomes the training set and the remainder the validation set.
TRAIN_FRACTION = 0.8

total = len(video_ids)
train_size = int(TRAIN_FRACTION * total)
train_vids, val_vids = video_ids[:train_size], video_ids[train_size:]

In [11]:
def copy_n_frames_from_vids(vid_list, src_root, dst_dir, frames_per_vid = 15, label="real"):
    """Copy a random sample of frames from each video folder into ``dst_dir``.

    For every video id in ``vid_list`` the entries of ``<src_root>/<vid>`` are
    shuffled and at most ``frames_per_vid`` of them are copied to ``dst_dir``
    under the name ``<label>_<running index, 6 digits>.png``. Sampling uses the
    global ``random`` generator, so seed it beforehand for reproducible output.

    Args:
        vid_list: Iterable of video folder names found under ``src_root``.
            It is not modified.
        src_root: Directory containing one sub-folder per video.
        dst_dir: Output directory; created if it does not exist.
        frames_per_vid: Maximum number of frames taken from each video.
        label: Prefix of the output file names and word used in the summary
            message. Defaults to ``"real"``.

    Returns:
        None. A summary line with the number of copied files is printed.
    """
    os.makedirs(dst_dir, exist_ok=True)

    # Shuffle a copy: the caller's list (e.g. the train/val id lists) must keep
    # its order because it is reused after this call.
    shuffled_vids = list(vid_list)
    random.shuffle(shuffled_vids)

    per_video_limit = max(frames_per_vid, 0)
    total_frames_copied = 0
    for vid in shuffled_vids:
        vid_dir = os.path.join(src_root, vid)
        # os.listdir returns entries in arbitrary order; sorting first makes the
        # seeded shuffle select the same frames on every machine and run.
        frames = sorted(os.listdir(vid_dir))
        random.shuffle(frames)

        for frame_name in frames[:per_video_limit]:
            # The output name always ends in ".png", whatever the source name was.
            shutil.copy(
                os.path.join(vid_dir, frame_name),
                os.path.join(dst_dir, f"{label}_{total_frames_copied:06d}.png"),
            )
            total_frames_copied += 1
    print(f"Copied {total_frames_copied} {label} images")


# Real images: sample 15 frames per training video and 10 per validation video.
for split_vids, split_dir, n_frames in (
    (train_vids, "data/images/train/real", 15),
    (val_vids, "data/images/val/real", 10),
):
    copy_n_frames_from_vids(split_vids, real_path, split_dir, frames_per_vid=n_frames)

# Fake images — we sample across all manipulation methods for better generalization
fake_methods = ["Deepfakes", "Face2Face", "FaceSwap", "FaceShifter", "NeuralTextures"]
def copy_fake_frames(vid_list, frames_per_vid, split):
    """Copy a random sample of fake frames for one split into ``data/images/<split>/fake``.

    Every folder ``<path>/fake/<method>/<vid>`` (for each entry of the
    module-level ``fake_methods``) is considered. A folder is used only when
    the first three characters of its name appear in ``vid_list``; from it, up
    to ``frames_per_vid`` files whose name ends in ``png`` are picked at random
    and copied as ``fake_<running index, 6 digits>.png``.

    Relies on the module-level ``path`` (dataset root) and on the global
    ``random`` generator; seed it beforehand for reproducible output.

    Args:
        vid_list: Collection of video ids that belong to this split.
        frames_per_vid: Maximum number of frames taken from each fake video folder.
        split: Split name used in the output directory and the summary line
            (e.g. ``"train"`` or ``"val"``).

    Returns:
        None. A summary line with the number of copied files is printed.
    """
    dst = f"data/images/{split}/fake"
    os.makedirs(dst, exist_ok=True)

    # Set lookup instead of scanning the whole list for every folder.
    split_ids = set(vid_list)
    per_video_limit = max(frames_per_vid, 0)

    copied = 0
    for method in fake_methods:
        # os.path.join (rather than an f-string with "/") keeps the folder
        # relative when `path` is empty, matching how the real-image folder is built.
        method_dir = os.path.join(path, "fake", method)
        if not os.path.isdir(method_dir):
            print(f"Skipping {method}: {method_dir} not found")
            continue

        # Sorted listings make the seeded sampling independent of filesystem order.
        for vid in sorted(os.listdir(method_dir)):
            vid_dir = os.path.join(method_dir, vid)
            if not os.path.isdir(vid_dir):
                continue

            # Only use fakes where target video is in the correct split.
            # Presumably folder names look like "<target>_<source>" with 3-character
            # ids — confirm against the dataset.
            # NOTE(review): only the leading id is checked; if the second id is a
            # source video from the other split, identities may leak across splits.
            vid_id = vid[:3]
            if vid_id not in split_ids:
                continue

            target_frames = sorted(f for f in os.listdir(vid_dir) if f.endswith("png"))
            random.shuffle(target_frames)
            for frame_name in target_frames[:per_video_limit]:
                shutil.copy(
                    os.path.join(vid_dir, frame_name),
                    os.path.join(dst, f"fake_{copied:06d}.png"),
                )
                copied += 1
    print(f"{split}/fake: {copied}")

# Fake images: 3 frames per fake video for training, 2 for validation.
for split_name, split_vids, n_frames in (("train", train_vids, 3), ("val", val_vids, 2)):
    copy_fake_frames(split_vids, frames_per_vid=n_frames, split=split_name)

Copied 11985 real images
Copied 2000 real images
train/fake: 11984
val/fake: 2000


## 2. Construct video dataset for CNN/LSTM training