In [1]:
import os
import shutil
import random

In [5]:
# Every path fragment ends with '/' so fragments can be joined by plain
# string concatenation.

# Root folder of each Celeb-DF release.
DS_CDFV1, DS_CDFV2 = 'celeb_df_v1/', 'celeb_df_v2/'
DATASET = [DS_CDFV1, DS_CDFV2]

# Folder holding the untouched download.
# NOTE: "ORGINAL" is a misspelling of "ORIGINAL"; the name is kept because
# other cells reference it.
DS_ORGINAL = 'dataset_original/'

# Stage folders laid out as <stage>/<split>/<class>/.
DS_SPLIT = 'dataset_split/'
DS_IFRAMES = 'dataset_iframes/'
DS_FACE = 'dataset_face/'
DS_FACE_IMG = 'dataset_face_img/'
TOP_LEVEL_1 = [DS_SPLIT, DS_IFRAMES, DS_FACE, DS_FACE_IMG]

# Stage folders laid out as <stage>/<segment>/<split>/<class>/.
DS_SEGMENTS = 'dataset_segments/'
DS_RAW = 'dataset_raw/'
DS_RESIDUALS = 'dataset_residuals/'
TOP_LEVEL_2 = [DS_SEGMENTS, DS_RAW, DS_RESIDUALS]

# Segment sub-folders used by the TOP_LEVEL_2 stages.
SEG_1, SEG_2, SEG_3, SEG_4, SEG_5 = (
    'seg_1/', 'seg_2/', 'seg_3/', 'seg_4/', 'seg_5/',
)
SEGMENTS = [SEG_1, SEG_2, SEG_3, SEG_4, SEG_5]

# Train / test / validation split folders.
DS_TRAIN, DS_TEST, DS_VAL = 'train_dataset/', 'test_dataset/', 'val_dataset/'
SPLIT = [DS_TRAIN, DS_TEST, DS_VAL]

# Class label folders.
CLASS_FAKE, CLASS_REAL = 'fake/', 'real/'
CLASS = [CLASS_REAL, CLASS_FAKE]

# Dataset directory setup

In [3]:
# Create <dataset>/<stage>/<split>/<class>/ for every stage in TOP_LEVEL_1.
# exist_ok=True makes the cell safe to re-run on an existing tree.
for dataset in DATASET:
    for top_level in TOP_LEVEL_1:
        for split in SPLIT:
            for class_dir in CLASS:
                # Named `target_dir`, not `dir`, so the builtin dir() is not
                # shadowed for the rest of the kernel session.
                target_dir = dataset + top_level + split + class_dir
                os.makedirs(target_dir, exist_ok=True)

In [6]:
# Create <dataset>/<stage>/<segment>/<split>/<class>/ for every stage in
# TOP_LEVEL_2. exist_ok=True makes the cell safe to re-run on an existing tree.
for dataset in DATASET:
    for top_level in TOP_LEVEL_2:
        for segment in SEGMENTS:
            for split in SPLIT:
                for class_dir in CLASS:
                    # Named `target_dir`, not `dir`, so the builtin dir() is
                    # not shadowed for the rest of the kernel session.
                    target_dir = dataset + top_level + segment + split + class_dir
                    os.makedirs(target_dir, exist_ok=True)

## Celeb-DF v1 & v2 Dataset

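Create a 'dataset_original' folder inside each of the 'celeb_df_v1' and 'celeb_df_v2' folders. Copy the three dataset folders (Celeb-real, Celeb-synthesis, and YouTube-real) into that folder, and place 'List_of_testing_videos.txt' directly inside 'celeb_df_v1' and 'celeb_df_v2'. Then run the directory setup cells above to create the intermediate dataset folders, followed by the cells below to split the videos.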

### Train, Test, Val Split

#### Test Dataset Creation

In [4]:
def create_test_dataset(dataset_path, split_rule_file):
    """Move the videos listed in a split-rule file into the test split.

    Every non-blank line of the rule file must hold two whitespace-separated
    fields, ``<class> <folder>/<filename>``. Class ``1`` is treated as real;
    any other integer is treated as fake.

    Args:
        dataset_path: Dataset root ending in a path separator (e.g. DS_CDFV1).
        split_rule_file: Name of the rule file, relative to ``dataset_path``.

    Raises:
        ValueError: If a non-blank line does not have exactly two fields, the
            video path does not have exactly one '/', or the class is not an
            integer.
        OSError: If the rule file cannot be opened. Errors raised by
            ``shutil.move`` (e.g. a listed video that is not present)
            propagate unchanged.
    """
    # Context manager guarantees the rule file is closed, even if a move fails.
    with open(dataset_path + split_rule_file) as rule_file:
        for line in rule_file:
            # Tolerate blank lines (e.g. a trailing newline at end of file).
            if not line.strip():
                continue

            vid_class, vid_path = line.split()
            _, filename = vid_path.split('/')

            # Class 0 -> Fake, Class 1 -> Real
            class_dir = CLASS_REAL if int(vid_class) == 1 else CLASS_FAKE

            org_path = dataset_path + DS_ORGINAL + vid_path
            new_path = dataset_path + DS_SPLIT + DS_TEST + class_dir + filename

            shutil.move(org_path, new_path)

In [10]:
# Celeb-DF v1: pull the officially listed test videos out of dataset_original/.
create_test_dataset(dataset_path=DS_CDFV1,
                    split_rule_file='List_of_testing_videos.txt')

In [5]:
# Celeb-DF v2: pull the officially listed test videos out of dataset_original/.
create_test_dataset(dataset_path=DS_CDFV2,
                    split_rule_file='List_of_testing_videos.txt')

#### Train Dataset Creation

In [6]:
def move_to_train(dataset, src_dir, vid_class):
    """Move every entry of a source folder into the train split of one class.

    Everything currently listed in ``dataset + src_dir`` is moved, so any
    videos meant for another split must already have been moved out.

    Args:
        dataset: Dataset root ending in a path separator (e.g. DS_CDFV1).
        src_dir: Source folder, relative to ``dataset``.
        vid_class: Class folder to move into (CLASS_REAL or CLASS_FAKE).

    Raises:
        OSError: If the source folder cannot be listed. Errors raised by
            ``shutil.move`` (e.g. a name clash in the destination) propagate
            unchanged.
    """
    src_path = dataset + src_dir
    dst_dir = dataset + DS_SPLIT + DS_TRAIN + vid_class

    # shutil.move() renames the source *to* `dst` when `dst` is not an existing
    # directory, so make sure the destination folder exists first.
    os.makedirs(dst_dir, exist_ok=True)

    for name in os.listdir(src_path):
        shutil.move(os.path.join(src_path, name), dst_dir)

In [45]:
# Celeb-DF v1: move what is left in each original folder into the train split.
move_to_train(dataset=DS_CDFV1,
              src_dir=DS_ORGINAL + 'Celeb-real/',
              vid_class=CLASS_REAL)
move_to_train(dataset=DS_CDFV1,
              src_dir=DS_ORGINAL + 'YouTube-real/',
              vid_class=CLASS_REAL)
move_to_train(dataset=DS_CDFV1,
              src_dir=DS_ORGINAL + 'Celeb-synthesis/',
              vid_class=CLASS_FAKE)

In [7]:
# Celeb-DF v2: move what is left in each original folder into the train split.
move_to_train(dataset=DS_CDFV2,
              src_dir=DS_ORGINAL + 'Celeb-real/',
              vid_class=CLASS_REAL)
move_to_train(dataset=DS_CDFV2,
              src_dir=DS_ORGINAL + 'YouTube-real/',
              vid_class=CLASS_REAL)
move_to_train(dataset=DS_CDFV2,
              src_dir=DS_ORGINAL + 'Celeb-synthesis/',
              vid_class=CLASS_FAKE)

#### Validation Dataset Creation

In [8]:
def create_val_dataset(src_dir, dst_dir, val_fraction=0.2, seed=1):
    """Move a seeded random sample of a folder's entries into another folder.

    ``int(len(entries) * val_fraction)`` entries of ``src_dir`` are chosen and
    moved into ``dst_dir``. The call is not idempotent: running it again on
    the same folders moves a further sample of whatever remains in ``src_dir``.

    Args:
        src_dir: Folder to sample from.
        dst_dir: Folder that receives the sampled entries; created if missing.
        val_fraction: Fraction of entries to move, within [0, 1].
        seed: Seed for the sampling generator.

    Raises:
        ValueError: If ``val_fraction`` is outside [0, 1].
        OSError: If ``src_dir`` cannot be listed. Errors raised by
            ``shutil.move`` propagate unchanged.
    """
    if not 0 <= val_fraction <= 1:
        raise ValueError(
            'val_fraction must be within [0, 1], got {!r}'.format(val_fraction)
        )

    # os.listdir() returns entries in arbitrary order; sorting makes the same
    # seed pick the same files regardless of filesystem or machine.
    filenames = sorted(os.listdir(src_dir))
    k = int(len(filenames) * val_fraction)

    # shutil.move() renames the source *to* `dst` when `dst` is not an existing
    # directory, so make sure the destination folder exists first.
    os.makedirs(dst_dir, exist_ok=True)

    # Private generator so the module-level `random` state is left untouched.
    rng = random.Random(seed)
    for name in rng.sample(filenames, k):
        shutil.move(os.path.join(src_dir, name), dst_dir)

In [75]:
# Celeb-DF v1: carve the validation split out of the train split, per class.
create_val_dataset(src_dir=DS_CDFV1 + DS_SPLIT + DS_TRAIN + CLASS_REAL,
                   dst_dir=DS_CDFV1 + DS_SPLIT + DS_VAL + CLASS_REAL)

create_val_dataset(src_dir=DS_CDFV1 + DS_SPLIT + DS_TRAIN + CLASS_FAKE,
                   dst_dir=DS_CDFV1 + DS_SPLIT + DS_VAL + CLASS_FAKE)

In [9]:
# Celeb-DF v2: carve the validation split out of the train split, per class.
create_val_dataset(src_dir=DS_CDFV2 + DS_SPLIT + DS_TRAIN + CLASS_REAL,
                   dst_dir=DS_CDFV2 + DS_SPLIT + DS_VAL + CLASS_REAL)

create_val_dataset(src_dir=DS_CDFV2 + DS_SPLIT + DS_TRAIN + CLASS_FAKE,
                   dst_dir=DS_CDFV2 + DS_SPLIT + DS_VAL + CLASS_FAKE)