# DFD Dataset Balancing

This notebook analyzes and balances the DeepFake Detection (DFD) dataset by matching original videos with their manipulated counterparts.

In [11]:
import os
from collections import defaultdict
import shutil

In [12]:
# Define paths
original_path = '../DFD/DFD_original_sequences'
manipulated_path = '../DFD/DFD_manipulated_sequences'

# Get all files
original_files = os.listdir(original_path)
manipulated_files = os.listdir(manipulated_path)

print(f"Total original videos: {len(original_files)}")
print(f"Total manipulated videos: {len(manipulated_files)}")

Total original videos: 364
Total manipulated videos: 3068


In [13]:
def get_base_name(filename):
    """Extract the base name from the video filename (removing ID and hash if present)"""
    if not filename.endswith('.mp4'):
        return None
        
    parts = filename.split('__')
    
    if len(parts) == 2:  # Original video (format: XX__name.mp4)
        # Keep the ID and base name for original videos
        return filename.replace('.mp4', '')
    elif len(parts) == 3:  # Manipulated video (format: XX_YY__name__HASH.mp4)
        # For manipulated, extract the source ID and base name
        id_parts = parts[0].split('_')
        if len(id_parts) >= 1:
            source_id = id_parts[0]  # Get the first ID (source ID)
            return f"{source_id}__{parts[1]}"  # Combine source ID with base name
    return None

# Create dictionaries to store videos by their base names
original_dict = defaultdict(list)
manipulated_dict = defaultdict(list)

# Categorize original videos
for f in original_files:
    base_name = get_base_name(f)
    if base_name:
        original_dict[base_name].append(f)

# Categorize manipulated videos
for f in manipulated_files:
    base_name = get_base_name(f)
    if base_name:
        manipulated_dict[base_name].append(f)

# Find matching pairs
matching_bases = set(original_dict.keys()) & set(manipulated_dict.keys())

print(f"\nUnique original video types: {len(original_dict)}")
print(f"Unique manipulated video types: {len(manipulated_dict)}")
print(f"Matching video types: {len(matching_bases)}")

# Print some examples to verify
print("\nExample categorizations:")
for i, (base, files) in enumerate(original_dict.items()):
    if i >= 3:  # Only show first 3 examples
        break
    print(f"\nBase name: {base}")
    print(f"Original: {files[0]}")
    if base in manipulated_dict:
        print(f"Manipulated: {manipulated_dict[base][0]}")


Unique original video types: 363
Unique manipulated video types: 358
Matching video types: 358

Example categorizations:

Base name: 07__exit_phone_room
Original: 07__exit_phone_room.mp4
Manipulated: 07_21__exit_phone_room__K7KXUHMU.mp4

Base name: 09__kitchen_pan
Original: 09__kitchen_pan.mp4
Manipulated: 09_21__kitchen_pan__Z8H2TRCI.mp4

Base name: 02__walking_down_street_outside_angry
Original: 02__walking_down_street_outside_angry.mp4
Manipulated: 02_07__walking_down_street_outside_angry__O4SXNLRL.mp4


In [14]:
# Analyze the matches in detail
print("Detailed matching analysis:")
print("-" * 50)

for base in sorted(matching_bases):
    orig_count = len(original_dict[base])
    manip_count = len(manipulated_dict[base])
    print(f"{base}:")
    print(f"  Original videos: {orig_count}")
    print(f"  Manipulated versions: {manip_count}")
    
    # Show some examples
    if orig_count > 0 and manip_count > 0:
        print(f"  Example original: {original_dict[base][0]}")
        print(f"  Example manipulated: {manipulated_dict[base][0]}")
    print()

Detailed matching analysis:
--------------------------------------------------
01__exit_phone_room:
  Original videos: 1
  Manipulated versions: 8
  Example original: 01__exit_phone_room.mp4
  Example manipulated: 01_11__exit_phone_room__4OJNJLOO.mp4

01__hugging_happy:
  Original videos: 1
  Manipulated versions: 13
  Example original: 01__hugging_happy.mp4
  Example manipulated: 01_15__hugging_happy__02HILKYO.mp4

01__kitchen_pan:
  Original videos: 1
  Manipulated versions: 8
  Example original: 01__kitchen_pan.mp4
  Example manipulated: 01_21__kitchen_pan__03X7CELV.mp4

01__kitchen_still:
  Original videos: 1
  Manipulated versions: 7
  Example original: 01__kitchen_still.mp4
  Example manipulated: 01_03__kitchen_still__JZUXXFRB.mp4

01__meeting_serious:
  Original videos: 1
  Manipulated versions: 14
  Example original: 01__meeting_serious.mp4
  Example manipulated: 01_20__meeting_serious__D8GWGO2A.mp4

01__outside_talking_pan_laughing:
  Original videos: 1
  Manipulated versions:

In [15]:
def create_balanced_dataset(original_dict, manipulated_dict, matching_bases, output_dir='../DFD/balanced_dataset'):
    """Create a balanced dataset by copying matching pairs to a new directory"""
    # Create output directories
    os.makedirs(os.path.join(output_dir, 'original'), exist_ok=True)
    os.makedirs(os.path.join(output_dir, 'manipulated'), exist_ok=True)
    
    total_pairs = 0
    
    for base in matching_bases:
        orig_videos = original_dict[base]
        manip_videos = manipulated_dict[base]
        
        # Copy one matching pair for each base type
        if orig_videos and manip_videos:
            # Copy original
            src = os.path.join(original_path, orig_videos[0])
            dst = os.path.join(output_dir, 'original', orig_videos[0])
            shutil.copy2(src, dst)
            
            # Copy manipulated
            src = os.path.join(manipulated_path, manip_videos[0])
            dst = os.path.join(output_dir, 'manipulated', manip_videos[0])
            shutil.copy2(src, dst)
            
            total_pairs += 1
    
    print(f"Created balanced dataset with {total_pairs} pairs of videos")
    print(f"Output directory: {output_dir}")

# Create the balanced dataset
create_balanced_dataset(original_dict, manipulated_dict, matching_bases)

Created balanced dataset with 358 pairs of videos
Output directory: ../DFD/balanced_dataset
