In [1]:
import os
import shutil
import random

In [4]:
DS_CDFV1 = 'celeb_df_v1/'
DS_CDFV2 = 'celeb_df_v2/'

DS_ORGINAL = 'dataset_original/'
DS_SPLIT = 'dataset_split/'
DS_IFRAMES = 'dataset_iframes/'
DS_FACE = 'dataset_face/'
DS_FACE_IMG = 'dataset_face_img/'
DS_SRM_SNIPPETS = 'dataset_srm_snippets_5/'
DS_SEGMENTS = 'dataset_segments/'
DS_RAW = 'dataset_raw/'
DS_RESIDUALS = 'dataset_residuals/'
DS_TEMPORAL = 'dataset_temporal/'

MODELS = 'models/'


SEG_1 = 'seg_1/'
SEG_2 = 'seg_2/'
SEG_3 = 'seg_3/'
SEG_4 = 'seg_4/'
SEG_5 = 'seg_5/'

SEG = ['seg_1_', 'seg_2_', 'seg_3_', 'seg_4_', 'seg_5_']

DS_TRAIN = 'train_dataset/'
DS_TEST = 'test_dataset/'
DS_VAL = 'val_dataset/'

CLASS_FAKE = 'fake/'
CLASS_REAL = 'real/'


TOP_LEVEL_1 = [DS_SPLIT, DS_IFRAMES, DS_FACE, DS_FACE_IMG, DS_SRM_SNIPPETS]
TOP_LEVEL_2 = [DS_SEGMENTS, DS_RAW, DS_RESIDUALS]
SEGMENTS = [SEG_1, SEG_2, SEG_3, SEG_4, SEG_5]
SPLIT = [DS_TRAIN, DS_TEST, DS_VAL]
CLASS = [CLASS_REAL, CLASS_FAKE]

DATASET = [DS_CDFV1, DS_CDFV2]

# Dataset directory setup

In [3]:
for dataset in DATASET:
    for top_level in TOP_LEVEL_1:
        for split in SPLIT:
            for class_dir in CLASS:
                dir = dataset + top_level + split + class_dir

                os.makedirs(dir, exist_ok=True) 

In [6]:
for dataset in DATASET:
    for top_level in TOP_LEVEL_2:
        for segment in SEGMENTS:
            for split in SPLIT:
                for class_dir in CLASS:
                    dir = dataset + top_level + segment + split + class_dir

                    os.makedirs(dir, exist_ok=True) 

## Celeb-DF v1 & v2 Dataset

Create a 'dataset_original' folder inside the 'celeb_df_v1' and 'celeb_df_v2' folder. Copy the three dataset folders: Celeb-real, Celeb-synthesis, and YouTube-real inside this folder, then run the below cell to create the intermediate dataset folders.

### Train, Test, Val Split

#### Test Dataset Creation

In [5]:
def create_test_dataset(dataset_path, split_rule_file):
    file = open(dataset_path + split_rule_file)

    for line in file:
        [vid_class, vid_path] = line.split()
        [_, filename] = vid_path.split('/')

        # Class 0 -> Fake, Class 1 -> Real
        class_dir = CLASS_REAL if int(vid_class) == 1 else CLASS_FAKE

        org_path = dataset_path + DS_ORGINAL + vid_path
        new_path = dataset_path + DS_SPLIT + DS_TEST + class_dir + filename

        shutil.move(org_path, new_path)

In [10]:
create_test_dataset(DS_CDFV1, 'List_of_testing_videos.txt')

In [7]:
create_test_dataset(DS_CDFV2, 'List_of_testing_videos.txt')

#### Train Dataset Creation

In [8]:
def move_to_train(dataset, src_dir, vid_class):
    dst_dir = dataset + DS_SPLIT + DS_TRAIN + vid_class
    
    files = os.listdir(dataset + src_dir)
    for file in files:
        shutil.move(os.path.join(dataset + src_dir, file), dst_dir)

In [45]:
move_to_train(DS_CDFV1, DS_ORGINAL + 'Celeb-real/', CLASS_REAL)
move_to_train(DS_CDFV1, DS_ORGINAL + 'YouTube-real/', CLASS_REAL)
move_to_train(DS_CDFV1, DS_ORGINAL + 'Celeb-synthesis/', CLASS_FAKE)

In [9]:
move_to_train(DS_CDFV2, DS_ORGINAL + 'Celeb-real/', CLASS_REAL)
move_to_train(DS_CDFV2, DS_ORGINAL + 'YouTube-real/', CLASS_REAL)
move_to_train(DS_CDFV2, DS_ORGINAL + 'Celeb-synthesis/', CLASS_FAKE)

#### Validation Dataset Creation

In [10]:
def create_val_dataset(src_dir, dst_dir):
    random.seed(1)

    filenames = os.listdir(src_dir)

    k = int(len(filenames) * 0.2)

    val_filenames = random.sample(filenames, k)
    for file in val_filenames:
        shutil.move(os.path.join(src_dir, file), dst_dir)

In [75]:
create_val_dataset(DS_CDFV1 + DS_SPLIT + DS_TRAIN + CLASS_REAL, 
                   DS_CDFV1 + DS_SPLIT + DS_VAL + CLASS_REAL)

create_val_dataset(DS_CDFV1 + DS_SPLIT + DS_TRAIN + CLASS_FAKE, 
                   DS_CDFV1 + DS_SPLIT + DS_VAL + CLASS_FAKE)

In [11]:
create_val_dataset(DS_CDFV2 + DS_SPLIT + DS_TRAIN + CLASS_REAL, 
                   DS_CDFV2 + DS_SPLIT + DS_VAL + CLASS_REAL)

create_val_dataset(DS_CDFV2 + DS_SPLIT + DS_TRAIN + CLASS_FAKE, 
                   DS_CDFV2 + DS_SPLIT + DS_VAL + CLASS_FAKE)

# Temporal

In [30]:
def get_video_names(src, k):
    files = os.listdir(src)
    files = set(files)

    # 518 files
    # 340 fake, 178 real
    test_files = random.sample(files, k)


    files = files.symmetric_difference(test_files)
    val_files = random.sample(files, k)

    train_files = files.symmetric_difference(val_files)

    return train_files, val_files, test_files


In [21]:
def group_video_frames(src_dir, dst_dir):

    for video_dir in os.listdir(src_dir):
        frame_dir = src_dir + video_dir + '/seg_1/'

        dst_dir = dst_dir + video_dir + '/'
        print(dst_dir)

        for frame in os.listdir(frame_dir):
            shutil.copy(os.path.join(frame_dir, frame), dst_dir)

        # print(count)
        # print(video_dir)
        break

In [31]:
train_fake, val_fake, test_fake = get_video_names(DS_CDFV2 + DS_TEMPORAL + DS_TEST + SEG_1 + CLASS_FAKE, 68)
train_real, val_real, test_real = get_video_names(DS_CDFV2 + DS_TEMPORAL + DS_TEST + SEG_1 + CLASS_REAL, 35)

since Python 3.9 and will be removed in a subsequent version.
  test_files = random.sample(files, k)
since Python 3.9 and will be removed in a subsequent version.
  val_files = random.sample(files, k)


In [59]:
def copy_videos(name_list, class_dir):
    for seg in SEGMENTS:
        src_base_path = DS_CDFV2 + DS_TEMPORAL + DS_TEST + seg
        dst_base_path = DS_CDFV2 + 'dataset_temporal_2/' + DS_TEST + seg

        vid_dir  = src_base_path + class_dir
        dst_vid_dir = dst_base_path + class_dir

        for vid in name_list:

            # print(vid_dir + vid + '/')
            # print(dst_vid_dir + vid + '/')

            # os.mkdir(dst_vid_dir + vid)

            shutil.copytree(vid_dir + vid + '/', dst_vid_dir + vid + '/', dirs_exist_ok=True)

In [61]:
copy_videos(test_real, CLASS_REAL)

In [108]:
def copy_train_val_videos(name_list, src_dir, class_dir):
    src_base_path = DS_CDFV2 + DS_TEMPORAL + DS_TEST + SEG_1
    dst_base_path = DS_CDFV2 + 'dataset_temporal_2/' + src_dir

    vid_dir = src_base_path + class_dir
    dst_vid_dir = dst_base_path + class_dir

    for vid in name_list:
        src_vid_dir = vid_dir + vid + '/' + SEG_1

        # print(src_vid_dir)
        # print(dst_vid_dir)
        count = 0
        
        for frame in os.listdir(src_vid_dir):
            print(src_vid_dir + frame)
            print(dst_vid_dir + f'{os.path.splitext(vid)[0]}_{count}.jpg')

            shutil.copy(src_vid_dir + frame, dst_vid_dir + f'{os.path.splitext(vid)[0]}_{count}.jpg')
            count += 1        

In [111]:
copy_train_val_videos(val_fake, DS_VAL, CLASS_FAKE)

celeb_df_v2/dataset_temporal/test_dataset/seg_1/fake/id39_id44_0008.mp4/seg_1/frame_208.jpg
celeb_df_v2/dataset_temporal_2/val_dataset/fake/id39_id44_0008_0.jpg
celeb_df_v2/dataset_temporal/test_dataset/seg_1/fake/id39_id44_0008.mp4/seg_1/frame_27.jpg
celeb_df_v2/dataset_temporal_2/val_dataset/fake/id39_id44_0008_1.jpg
celeb_df_v2/dataset_temporal/test_dataset/seg_1/fake/id39_id44_0008.mp4/seg_1/frame_5.jpg
celeb_df_v2/dataset_temporal_2/val_dataset/fake/id39_id44_0008_2.jpg
celeb_df_v2/dataset_temporal/test_dataset/seg_1/fake/id39_id44_0008.mp4/seg_1/frame_108.jpg
celeb_df_v2/dataset_temporal_2/val_dataset/fake/id39_id44_0008_3.jpg
celeb_df_v2/dataset_temporal/test_dataset/seg_1/fake/id39_id44_0008.mp4/seg_1/frame_124.jpg
celeb_df_v2/dataset_temporal_2/val_dataset/fake/id39_id44_0008_4.jpg
celeb_df_v2/dataset_temporal/test_dataset/seg_1/fake/id39_id44_0008.mp4/seg_1/frame_31.jpg
celeb_df_v2/dataset_temporal_2/val_dataset/fake/id39_id44_0008_5.jpg
celeb_df_v2/dataset_temporal/test_data

In [22]:
group_video_frames(DS_CDFV2 + DS_TEMPORAL + DS_TEST + 'seg_1/' + CLASS_FAKE, DS_CDFV2 + 'dataset_temporal_2/' + DS_TRAIN + CLASS_FAKE)

celeb_df_v2/dataset_temporal_2/test_dataset/fake/id25_id28_0003.mp4/
250
id25_id28_0003.mp4
