In [1]:
from utils import load_deepfake_dataset
from extraction_utils.data_utils import read_label_file
from typing import List, Tuple
import pandas as pd
import os

def read(train_labels):
    train_list: List[Tuple[str, int]] = read_label_file(train_labels)
    train_list = pd.DataFrame(train_list, columns=["filename", "is_genuine", "method_type", "method_name", "vocoder"])
    train_list["utterance"] = train_list["filename"].apply(lambda x: os.path.basename(x).split(".")[0])
    train_list["speaker"] = train_list["utterance"].apply(lambda x: x.split("_")[0])
    return train_list


train_labels, dev_labels, test_labels = load_deepfake_dataset()

TRAIN_LIST = read(train_labels)
DEV_LIST = read(dev_labels)
TEST_LIST = read(test_labels)

!!!!!!Unknown: labels_BSI_train_genuine.txt
number of deepfakes: 688432
number of genuine: 8883
!!!!!!Unknown: labels_BSI_valid_genuine.txt
number of deepfakes: 335098
number of genuine: 987
!!!!!!Unknown: labels_BSI_test_genuine.txt
number of deepfakes: 223015
number of genuine: 3284


In [2]:
# Function to get speaker statistics
def get_speaker_stats(train_speakers, other_speakers):
    unique_speakers = len(other_speakers)
    speakers_in_train = pd.Series(other_speakers).isin(train_speakers).sum()
    speakers_not_in_train = unique_speakers - speakers_in_train
    # Assuming train_speakers and other_speakers are defined as pandas Series
    speakers_in_train_not_in_other = len(pd.Series(train_speakers)[~pd.Series(train_speakers).isin(other_speakers)])
    return unique_speakers, speakers_not_in_train, speakers_in_train, speakers_in_train_not_in_other

train_list = TRAIN_LIST[TRAIN_LIST["is_genuine"] == 1]
dev_list = DEV_LIST[DEV_LIST["is_genuine"] == 1]
test_list = TEST_LIST[TEST_LIST["is_genuine"] == 1]

# Extract unique speakers from each list
train_speakers = train_list["speaker"].unique()
dev_speakers = dev_list["speaker"].unique()
test_speakers = test_list["speaker"].unique()

# Get statistics for dev_list and test_list compared to train_list
dev_stats = get_speaker_stats(train_speakers, dev_speakers)
test_stats = get_speaker_stats(train_speakers, test_speakers)

In [3]:
# Display the results
print("Train List Statistics:")
print(f" - Number of unique speakers: {len(train_speakers)}")

print("\nDev List Statistics:")
print(f" - Number of unique speakers: {dev_stats[0]}")
print(f" - Number of speakers not in train_list: {dev_stats[1]}")
print(f" - Number of speakers in train_list: {dev_stats[2]}")
print(f" - Number of speakers in train but not in dev: {dev_stats[3]}")

print("\nTest List Statistics:")
print(f" - Number of unique speakers: {test_stats[0]}")
print(f" - Number of speakers not in train_list: {test_stats[1]}")
print(f" - Number of speakers in train_list: {test_stats[2]}")
print(f" - Number of speakers in train but not in test: {test_stats[3]}")

Train List Statistics:
 - Number of unique speakers: 658

Dev List Statistics:
 - Number of unique speakers: 524
 - Number of speakers not in train_list: 0
 - Number of speakers in train_list: 524
 - Number of speakers in train but not in dev: 134

Test List Statistics:
 - Number of unique speakers: 219
 - Number of speakers not in train_list: 219
 - Number of speakers in train_list: 0
 - Number of speakers in train but not in test: 658


In [4]:
# Extract unique speakers from each list
genuine_list = TRAIN_LIST[TRAIN_LIST["is_genuine"] == 1]
genuine_speakers = genuine_list["speaker"].unique()

deepfake_list = TRAIN_LIST[TRAIN_LIST["is_genuine"] == 0]
deepfake_speakers = deepfake_list["speaker"].unique()

# Get statistics for dev_list and test_list compared to train_list
deepfake_stats = get_speaker_stats(genuine_speakers, deepfake_speakers)

In [5]:
print("\Deepfake Statistics:")
print(f" - Number of unique genuine speakers: {len(genuine_speakers)}")
print(f" - Number of unique deepfake speakers: {len(deepfake_speakers)}")
print(f" - Number of deepfake speakers not in genuine_list: {test_stats[1]}")
print(f" - Number of genuin speakers not in deepfake speakers: {test_stats[3]}")

\Deepfake Statistics:
 - Number of unique genuine speakers: 658
 - Number of unique deepfake speakers: 877
 - Number of deepfake speakers not in genuine_list: 219
 - Number of genuin speakers not in deepfake speakers: 658
