In [1]:
# imports
import json
import glob
import os
import pandas as pd
from collections import defaultdict

In [2]:
DATASET_DIR = "/scratch-shared/scur0555/datasets"
DATASET = "df40"
DATASET_PATH = f"{DATASET_DIR}/{DATASET}/test"
DATASET_TYPE = "MidJourney"

In [12]:
def list_files(startpath, max_depth=None):
    for root, dirs, files in os.walk(startpath):
        # Calculate current level
        level = root.replace(startpath, '').count(os.sep)
        
        # If we exceed max_depth, remove directories from further walking
        if max_depth is not None and level >= max_depth:
            dirs[:] = []  # This effectively stops os.walk from going deeper
        
        indent = ' ' * 4 * level
        print('{}{}/'.format(indent, os.path.basename(root)))
        
        subindent = ' ' * 4 * (level + 1)
        for f in files:
            print('{}{}'.format(subindent, f))

### DF40

In [13]:
"""
/scratch-shared/scur0555/datasets/df40
│
├── test
│   ├── fsgan
│       ├── ff
│       └── cdf
│   ├── faceswap
│       ├── ff
│       └── cdf
│   ├── simswap
│       ├── ff
│       └── cdf
│   ├── inswap
│       ├── ff
│       └── cdf
│   ├── blendface
│       ├── ff
│       └── cdf
│   ├── uniface
│       ├── ff
│       └── cdf
│   ├── mobileswap
│       ├── ff
│       └── cdf
│   ├── e4s
│       ├── ff
│       └── cdf
│   ├── facedancer
│       ├── ff
│       └── cdf
│   ├── fomm
│       ├── ff
│       └── cdf
│   ├── facevid2vid
│       ├── ff
│       └── cdf
│   ├── wav2lip
│       ├── ff
│       └── cdf
│   ├── MRAA
│       ├── ff
│       └── cdf
│   ├── one_shot_free
│       ├── ff
│       └── cdf
│   ├── pirender
│       ├── ff
│       └── cdf
│   ├── tpsm
│       ├── ff
│       └── cdf
│   ├── lia
│       ├── ff
│       └── cdf
│   ├── danet
│       ├── ff
│       └── cdf
│   ├── sadtalker
│       ├── ff
│       └── cdf
│   ├── mcnet
│       ├── ff
│       └── cdf
│   ├── heygen
│       ├── fake
│       └── real
│   ├── VQGAN
│       ├── ff
│       └── cdf
│   ├── StyleGAN2
│       ├── ff
│       └── cdf
│   ├── StyleGAN3
│       ├── ff
│       └── cdf
│   ├── StyleGANXL
│       ├── ff
│       └── cdf
│   ├── sd2.1
│       ├── ff
│       └── cdf
│   ├── ddim
│       ├── ff
│       └── cdf
│   ├── PixArt
│       ├── ff
│       └── cdf
│   ├── DiT
│       ├── ff
│       └── cdf
│   ├── SiT
│       ├── ff
│       └── cdf
│   ├── MidJourney
│       ├── fake
│       └── real
│   ├── whichfaceisreal
│       ├── fake
│       └── real
│   ├── stargan
│       ├── fake
│       └── real
│   ├── starganv2
│       ├── fake
│       └── real
│   ├── styleclip
│       ├── fake
│       └── real
│   ├── e4e
│       ├── fake
│       └── real
│   └── CollabDiff
│       ├── fake
│       └── real
"""

'\n/scratch-shared/scur0555/datasets/df40\n│\n├── test\n│   ├── fsgan\n│       ├── ff\n│       └── cdf\n│   ├── faceswap\n│       ├── ff\n│       └── cdf\n│   ├── simswap\n│       ├── ff\n│       └── cdf\n│   ├── inswap\n│       ├── ff\n│       └── cdf\n│   ├── blendface\n│       ├── ff\n│       └── cdf\n│   ├── uniface\n│       ├── ff\n│       └── cdf\n│   ├── mobileswap\n│       ├── ff\n│       └── cdf\n│   ├── e4s\n│       ├── ff\n│       └── cdf\n│   ├── facedancer\n│       ├── ff\n│       └── cdf\n│   ├── fomm\n│       ├── ff\n│       └── cdf\n│   ├── facevid2vid\n│       ├── ff\n│       └── cdf\n│   ├── wav2lip\n│       ├── ff\n│       └── cdf\n│   ├── MRAA\n│       ├── ff\n│       └── cdf\n│   ├── one_shot_free\n│       ├── ff\n│       └── cdf\n│   ├── pirender\n│       ├── ff\n│       └── cdf\n│   ├── tpsm\n│       ├── ff\n│       └── cdf\n│   ├── lia\n│       ├── ff\n│       └── cdf\n│   ├── danet\n│       ├── ff\n│       └── cdf\n│   ├── sadtalker\n│       ├── ff\n│       └──

In [14]:
# dataset structure
glob.glob(f"{DATASET_DIR}/{DATASET}/**")

['/scratch-shared/scur0555/datasets/df40/train',
 '/scratch-shared/scur0555/datasets/df40/configs',
 '/scratch-shared/scur0555/datasets/df40/val',
 '/scratch-shared/scur0555/datasets/df40/test']

In [15]:
# config files used while loading the presets
configs = glob.glob(f"{DATASET_DIR}/{DATASET}/configs/*.json")
print("Total Configs", len(configs))
print(f"{DATASET_DIR}/{DATASET}/configs/")

configs.sort()
config_data = defaultdict(list)
for cfg in configs:
    # print("\t", os.path.basename(cfg))
    # ignore DF40 all config
    if cfg.endswith("DF40_all.json"): 
        continue
    cfg_name = os.path.basename(cfg)
    config_data["dataset"].append(cfg_name.split(".")[0].split("_")[0])
    config_data["config"].append(cfg_name)
    config_data["source"].append(cfg_name.split("_")[-1].split(".")[0])

config_df = pd.DataFrame(config_data)
# grouped_df = config_df.groupby('dataset')

print(config_df.set_index(['dataset', 'config']).sort_index())

Total Configs 82
/scratch-shared/scur0555/datasets/df40/configs/
                                   source
dataset     config                       
CollabDiff  CollabDiff.json    CollabDiff
DiT         DiT_cdf.json              cdf
            DiT_ff.json                ff
EFSAll      EFSAll_cdf.json           cdf
            EFSAll_ff.json             ff
...                                   ...
uniface     uniface_ff.json            ff
            uniface_ori.json          ori
wav2lip     wav2lip_cdf.json          cdf
            wav2lip_ff.json            ff
whichisreal whichisreal.json  whichisreal

[81 rows x 1 columns]


In [16]:
config_names = config_df["dataset"].unique()
print(config_names)
print("Total Unique Datasets:", len(config_names))

['CollabDiff' 'DiT' 'EFSAll' 'FRAll' 'FSAll' 'MRAA' 'MidJourney' 'SiT'
 'StyleGAN2' 'StyleGAN3' 'StyleGANXL' 'VQGAN' 'blendface' 'danet' 'ddim'
 'deepfacelab' 'e4e' 'e4s' 'facedancer' 'faceswap' 'facevid2vid' 'fomm'
 'fsgan' 'heygen' 'hyperreenact' 'inswap' 'lia' 'mcnet' 'mobileswap' 'one'
 'pirender' 'pixart' 'rddm' 'sadtalker' 'sd2' 'simswap' 'stargan'
 'starganv2' 'styleclip' 'tpsm' 'uniface' 'wav2lip' 'whichisreal']
Total Unique Datasets: 43


In [17]:
datasets = [os.path.basename(p) for p in glob.glob(f"{DATASET_PATH}/*") if not p.endswith(('.zip', '.json'))]
print("Total Datasets: ", len(datasets))

Total Datasets:  37


In [18]:
# missing data
set(config_names) - set(datasets)

{'EFSAll',
 'FRAll',
 'FSAll',
 'e4e',
 'one',
 'pixart',
 'rddm',
 'sd2',
 'whichisreal'}

In [19]:
ff_cdf_list = [
    'fsgan',
    'faceswap',
    'simswap',
    'inswap',
    'blendface',
    'uniface',
    'mobileswap',
    'e4s',
    'facedancer',
    'fomm',
    'facevid2vid',
    'wav2lip',
    'MRAA',
    'one_shot_free',
    'pirender',
    'tpsm',
    'lia',
    'danet',
    'sadtalker',
    'mcnet',
    'VQGAN',
    'StyleGAN2',
    'StyleGAN3',
    'StyleGANXL',
    'sd2.1',
    'ddim',
    'PixArt',
    'DiT',
    'SiT'
]

real_fake_list = [
    'heygen',
    'MidJourney',
    'whichfaceisreal',
    'stargan',
    'starganv2',
    'styleclip',
    'CollabDiff'
]


In [40]:
len(glob.glob(f"{DATASET_PATH}/whichisreal/real/*.jpeg"))

1000

### EFS Split

In [3]:
with open("/scratch-shared/scur0555/datasets/df40/configs/DF40_all.json", "r") as f:
    data = json.load(f)

In [4]:
train_list = list(data["DF40_all"]["DF40All_Real"]["train"].keys())
# data["DF40_all"]["DF40All_Real"]["train"]["071_inswap"]
# train_list
unique_train_list = set([data["DF40_all"]["DF40All_Real"]["train"][name]["label"] for name in train_list])
unique_train_list

{'CollabDiff_Real',
 'EFSAll_Real',
 'FRAll_Real',
 'FSAll_Real',
 'MidJourney_Real',
 'deepfacelab_Real',
 'heygen_Real',
 'starganv2_Real',
 'styleclip_Real',
 'whichisreal_Real'}

In [5]:
with open("/scratch-shared/scur0555/datasets/df40/configs/EFSAll_ff.json", "r") as f:
    data = json.load(f)

In [6]:
data["EFSAll_ff"]["EFSAll_Real"].keys()

dict_keys(['train', 'test', 'val'])

In [7]:
for d in sorted(data["EFSAll_ff"]["EFSAll_Real"]["val"]):
    print(data["EFSAll_ff"]["EFSAll_Real"]["val"][d])
    break

for d in sorted(data["EFSAll_ff"]["EFSAll_Real"]["test"]):
    print(data["EFSAll_ff"]["EFSAll_Real"]["test"][d])
    break

{'label': 'EFSAll_Real', 'frames': ['deepfakes_detection_datasets/FaceForensics++/original_sequences/youtube/c23/frames/000/254.png', 'deepfakes_detection_datasets/FaceForensics++/original_sequences/youtube/c23/frames/000/050.png', 'deepfakes_detection_datasets/FaceForensics++/original_sequences/youtube/c23/frames/000/101.png', 'deepfakes_detection_datasets/FaceForensics++/original_sequences/youtube/c23/frames/000/305.png', 'deepfakes_detection_datasets/FaceForensics++/original_sequences/youtube/c23/frames/000/152.png', 'deepfakes_detection_datasets/FaceForensics++/original_sequences/youtube/c23/frames/000/114.png', 'deepfakes_detection_datasets/FaceForensics++/original_sequences/youtube/c23/frames/000/293.png', 'deepfakes_detection_datasets/FaceForensics++/original_sequences/youtube/c23/frames/000/140.png', 'deepfakes_detection_datasets/FaceForensics++/original_sequences/youtube/c23/frames/000/369.png', 'deepfakes_detection_datasets/FaceForensics++/original_sequences/youtube/c23/frame

In [8]:
subset = ["pixart", "SiT", "VQGAN", "StyleGANXL"]

def print_stats(split):    
    print(split.capitalize())
    
    real_keys = [
        k for k in data["EFSAll_ff"]["EFSAll_Real"][split]
        if any(sub in k for sub in subset)
    ]
    real_images = [
        len(data["EFSAll_ff"]["EFSAll_Real"][split][k]['frames'])
        for k in real_keys
    ]
    real_total = sum(real_images)
    
    fake_keys = [
        k for k in data["EFSAll_ff"]["EFSAll_Fake"][split]
        if any(sub in k for sub in subset)
    ]
    fake_images = [
        len(data["EFSAll_ff"]["EFSAll_Fake"][split][k]['frames'])
        for k in fake_keys
    ]
    fake_total = sum(fake_images)
    
    print(f"Total Real Frames: {real_total}")
    print(f"Total Fake Frames: {fake_total}")
    print(f"TotalFrames: {real_total+fake_total}")
    unique_ds = set([name.split("_")[-1] for name in real_keys])
    print(unique_ds, len(unique_ds))

print_stats("train"), print_stats("test"), print_stats("val")

Train
Total Real Frames: 91972
Total Fake Frames: 72028
TotalFrames: 164000
{'VQGAN', 'SiT', 'pixart', 'StyleGANXL'} 4
Test
Total Real Frames: 17916
Total Fake Frames: 14844
TotalFrames: 32760
{'VQGAN', 'SiT', 'pixart', 'StyleGANXL'} 4
Val
Total Real Frames: 17916
Total Fake Frames: 14844
TotalFrames: 32760
{'VQGAN', 'SiT', 'pixart', 'StyleGANXL'} 4


(None, None, None)

In [9]:
# real_keys = data["EFSAll_ff"]["EFSAll_Real"]["train"].keys()
train_list = list()
unique_train_list = set([name.split("_")[-1] for name in train_list])
unique_train_list

set()

In [40]:
if "pixart" not in subset:
    print("yes")

yes


In [36]:
count = 0
sub_dataset_info = data["EFSAll_ff"]["EFSAll_Real"]["train"]
for video_name, video_info in sub_dataset_info.items():
    if not any(sub in video_name for sub in subset):
        continue
    count += 1

count

2876