# Count all the german files in the dataset

In [4]:
import pandas as pd

# Define the path to your CSV file
csv_file_path = "/ceph/shared/ALL/datasets/voxceleb2-V2/voxceleb2-language-identification.csv"

# Read the CSV file
df = pd.read_csv(csv_file_path)

# Count rows where detected_language is 'de'
german_count = df[df['detected_language'] == 'de'].shape[0]

print(f"Total rows: {len(df)}")
print(f"German ('de') files found: {german_count}")

Total rows: 1092009
German ('de') files found: 89700


# generate a csv file of german paths

In [5]:
# ...existing code...
# Filter the DataFrame to get only German rows
german_df = df[df['detected_language'] == 'de']

# Define the output path for the new CSV
output_csv_path = "/ceph/shared/ALL/datasets/voxceleb2-V2/voxceleb2-german-only-whisperLarge.csv"

# Save to CSV (index=False prevents writing the row numbers)
german_df.to_csv(output_csv_path, index=False)

print(f"Saved {len(german_df)} German entries to: {output_csv_path}")

Saved 89700 German entries to: /ceph/shared/ALL/datasets/voxceleb2-V2/voxceleb2-german-only-whisperLarge.csv


# extract the german pairs of audio and video

In [6]:
import pandas as pd
import shutil
import os
from tqdm import tqdm

# --- Configuration ---
csv_file_path = "/ceph/shared/ALL/datasets/voxceleb2-V2/voxceleb2-german-only-whisperLarge.csv"

# Source Roots
src_aac_root = "/ceph/shared/ALL/datasets/voxceleb2-V2/dev/aac"
src_mp4_root = "/ceph/shared/ALL/datasets/voxceleb2-V2/dev/mp4"

# Destination Roots
dst_aac_root = "/ceph/shared/ALL/datasets/voxceleb2-V2/VoxCeleb2-German/dev/aac"
dst_mp4_root = "/ceph/shared/ALL/datasets/voxceleb2-V2/VoxCeleb2-German/dev/mp4"

def copy_german_files(csv_path, src_aac, src_mp4, dst_aac, dst_mp4):
    print(f"Reading CSV: {csv_path}")
    df = pd.read_csv(csv_path)
    
    # We iterate through each row in the CSV
    # Each row contains a full path like: /ceph/.../dev/aac/id00015/vUAbwL9omyM/00452.m4a
    
    print(f"Found {len(df)} files to copy.")
    
    for index, row in tqdm(df.iterrows(), total=len(df), desc="Copying Files"):
        full_audio_path = row['file_path']
        
        # 1. Parse the path to extract ID, Sequence, and Filename
        # We assume the structure ends with: .../aac/<id>/<seq>/<filename>
        parts = full_audio_path.split('/')
        
        filename = parts[-1]       # 00452.m4a
        seq_folder = parts[-2]     # vUAbwL9omyM
        id_folder = parts[-3]      # id00015
        
        # 2. Construct Source Paths
        # Audio is already known
        src_audio = full_audio_path
        
        # Video path: replace 'aac' with 'mp4' and extension .m4a with .mp4
        # We construct it manually to be safe
        video_filename = filename.replace('.m4a', '.mp4')
        src_video = os.path.join(src_mp4, id_folder, seq_folder, video_filename)
        
        # 3. Construct Destination Paths (preserving structure)
        # Audio Destination
        dst_audio_dir = os.path.join(dst_aac, id_folder, seq_folder)
        dst_audio_file = os.path.join(dst_audio_dir, filename)
        
        # Video Destination
        dst_video_dir = os.path.join(dst_mp4, id_folder, seq_folder)
        dst_video_file = os.path.join(dst_video_dir, video_filename)
        
        # 4. Create Directories
        os.makedirs(dst_audio_dir, exist_ok=True)
        os.makedirs(dst_video_dir, exist_ok=True)
        
        # 5. Copy Files
        # Copy Audio
        if os.path.exists(src_audio):
            if not os.path.exists(dst_audio_file):
                try:
                    shutil.copy2(src_audio, dst_audio_file)
                except Exception as e:
                    print(f"Error copying audio {src_audio}: {e}")
        else:
            # print(f"Warning: Source audio missing: {src_audio}")
            pass

        # Copy Video
        if os.path.exists(src_video):
            if not os.path.exists(dst_video_file):
                try:
                    shutil.copy2(src_video, dst_video_file)
                except Exception as e:
                    print(f"Error copying video {src_video}: {e}")
        else:
            # print(f"Warning: Source video missing: {src_video}")
            pass

    print("Copy process completed.")

if __name__ == "__main__":
    copy_german_files(csv_file_path, src_aac_root, src_mp4_root, dst_aac_root, dst_mp4_root)

Reading CSV: /ceph/shared/ALL/datasets/voxceleb2-V2/voxceleb2-german-only-whisperLarge.csv
Found 89700 files to copy.


Copying Files: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████| 89700/89700 [43:31<00:00, 34.34it/s]

Copy process completed.





In [None]:
import os

# --- Configuration ---
# The reference folder (the one that has all 714 items)
ref_path = "/ceph/shared/ALL/datasets/voxceleb2-V2/VoxCeleb2-German/dev/aac"

# The target folder (the one with 711 items)
# UPDATE THIS PATH to the actual location of 'vox2_german_video_seg16s'
target_path = "/ceph/shared/ALL/datasets/voxceleb2-V2/VoxCeleb2-German/dev/processedVideos/vox2_german/vox2_german_video_seg16s" 

# --- Processing ---
# Get the list of items in both directories
# We use set() to make mathematical subtraction easy
ref_items = set(os.listdir(ref_path))
target_items = set(os.listdir(target_path))

# Calculate the difference
missing_items = ref_items - target_items

print(f"Items in Reference (aac): {len(ref_items)}")
print(f"Items in Target (seg16s): {len(target_items)}")
print(f"Number of missing items: {len(missing_items)}")
print("-" * 30)

if len(missing_items) > 0:
    print("The following folders are missing in 'vox2_german_video_seg16s':")
    for item in sorted(missing_items):
        print(item)
else:
    print("No missing items found! The folders match.")

FileNotFoundError: [Errno 2] No such file or directory: '/ceph/shared/ALL/datasets/voxceleb2-V2/vox2_german_video_seg16s'