In [None]:
import os
import random

import pandas as pd

from dataloader import ValidationDataset, LibriSpeechLoader, VoxCelebLoader, RandomTripletLossDataset, BSILoader
from utils import load_deepfake_dataset

In [None]:
def create(labels, name, sete, LOADER, seed=None):
    """Build a speaker-verification pair list and write it to a CSV file.

    For every row of ``ValidationDataset(...).data_list`` up to two pairs are
    emitted:

    * ``[utterance, <other utterance of the same speaker>, 1]``
    * ``[utterance, <utterance of a randomly chosen other speaker>, 0]``

    Parameters
    ----------
    labels : object
        Forwarded unchanged as the first argument of ``LOADER``.
    name : str
        Base name of the output file (``{name}.csv``).
    sete : str
        Sub-directory of ``../validation_sets`` that receives the file.
    LOADER : callable
        Loader factory, invoked as ``LOADER(labels, lambda x: x, 0)``.
    seed : int or None, optional
        Seed for the pair sampling. With ``None`` (the default) the global
        ``random`` module state is used, so results vary between runs unless
        the caller seeded ``random`` beforehand.

    Returns
    -------
    None
        The result is only written to
        ``../validation_sets/{sete}/{name}.csv`` (the directory is created
        when missing).

    Notes
    -----
    ``data_list`` must expose ``'utterance'`` and ``'speaker'`` columns.
    A row whose speaker has no second distinct utterance gets no positive
    pair, and no negative pairs are produced when only one speaker exists;
    such rows are skipped instead of raising from an empty sample. Rows
    whose speaker label is missing are skipped entirely.
    """
    dataset = ValidationDataset(loader=LOADER(labels, lambda x: x, 0))
    df = dataset.data_list

    # A dedicated generator keeps seeded runs reproducible without touching
    # the global random state; unseeded runs keep using the global module.
    rng = random if seed is None else random.Random(seed)

    # Index the utterances per speaker once, instead of re-filtering the whole
    # frame for every row.
    utterances_by_speaker = {
        speaker: group['utterance'].tolist()
        for speaker, group in df.groupby('speaker', sort=False)
    }
    speakers = list(utterances_by_speaker)
    speaker_position = {speaker: pos for pos, speaker in enumerate(speakers)}

    new_dataset = []
    for utterance, speaker in zip(df['utterance'], df['speaker']):
        own_utterances = utterances_by_speaker.get(speaker)
        if own_utterances is None:
            # groupby drops missing speaker labels; nothing to pair against.
            continue

        # Positive pair: any other utterance of the same speaker.
        same_candidates = [other for other in own_utterances if other != utterance]
        if same_candidates:
            new_dataset.append([utterance, rng.choice(same_candidates), 1])

        # Negative pair: draw uniformly from the speakers *other* than the
        # current one by sampling an index in a range that is one shorter and
        # stepping over the current speaker's own position.
        if len(speakers) > 1:
            pick = rng.randrange(len(speakers) - 1)
            if pick >= speaker_position[speaker]:
                pick += 1
            other_utterance = rng.choice(utterances_by_speaker[speakers[pick]])
            new_dataset.append([utterance, other_utterance, 0])

    new_df = pd.DataFrame(new_dataset, columns=['utterance', 'utterance_to_check', 'is_same_speaker'])

    # Make sure the target directory exists before writing.
    out_path = f'../validation_sets/{sete}/{name}.csv'
    os.makedirs(os.path.dirname(out_path), exist_ok=True)
    new_df.to_csv(out_path, index=False)

In [None]:
# Unpack the three label sets returned for "LibriSpeech" and write the
# training pair list.
# NOTE: train_labels / valid_labels / test_labels are rebound whenever another
# dataset is loaded further down, so run this dataset's valid and test cells
# before moving on.
train_labels, valid_labels, test_labels = load_deepfake_dataset("LibriSpeech")
create(labels=train_labels, name="train", sete="LibriSpeech", LOADER=LibriSpeechLoader)

In [None]:
# Write the LibriSpeech validation pairs to ../validation_sets/LibriSpeech/valid.csv.
create(labels=valid_labels, name="valid", sete="LibriSpeech", LOADER=LibriSpeechLoader)

In [None]:
# Write the LibriSpeech test pairs to ../validation_sets/LibriSpeech/test.csv.
create(labels=test_labels, name="test", sete="LibriSpeech", LOADER=LibriSpeechLoader)

In [None]:
# Unpack the three label sets returned for "VoxCeleb" and write the training
# pair list.
# NOTE: this rebinds train_labels / valid_labels / test_labels, replacing the
# values loaded for the previous dataset.
train_labels, valid_labels, test_labels = load_deepfake_dataset("VoxCeleb")
create(labels=train_labels, name="train", sete="VoxCeleb", LOADER=VoxCelebLoader)

In [None]:
# Write the VoxCeleb validation pairs to ../validation_sets/VoxCeleb/valid.csv.
create(labels=valid_labels, name="valid", sete="VoxCeleb", LOADER=VoxCelebLoader)

In [None]:
# Write the VoxCeleb test pairs to ../validation_sets/VoxCeleb/test.csv.
create(labels=test_labels, name="test", sete="VoxCeleb", LOADER=VoxCelebLoader)
# TODO(review): the original note here says to use the published VoxCeleb
# trial list https://www.robots.ox.ac.uk/~vgg/data/voxceleb/meta/veri_test.txt
# presumably in place of the randomly sampled test pairs above; confirm.

In [None]:
def create_deepfake(labels, name, sete, LOADER, seed=None):
    """Build a genuine-vs-deepfake verification pair list and write it to CSV.

    For every row of ``RandomTripletLossDataset(...).genuine`` up to two pairs
    are emitted:

    * label ``1``: another utterance of the same speaker taken from
      ``genuine``;
    * label ``0``: an utterance attributed to the *same* speaker taken from
      the rows of ``data_list`` where ``is_genuine == 0``.

    Each pair also records the ``method_name`` of both utterances. The
    utterance and its ``method_name`` are always read from the same source
    row.

    Parameters
    ----------
    labels : object
        Forwarded unchanged as the first argument of ``LOADER``.
    name : str
        Base name of the output file (``{name}.csv``).
    sete : str
        Sub-directory of ``../validation_sets`` that receives the file.
    LOADER : callable
        Loader factory, invoked as ``LOADER(labels, lambda x: x, 0)``.
    seed : int or None, optional
        Seed for the pair sampling. With ``None`` (the default) the global
        ``random`` module state is used.

    Returns
    -------
    None
        The result is only written to
        ``../validation_sets/{sete}/{name}.csv`` (the directory is created
        when missing).

    Notes
    -----
    ``genuine`` and ``data_list`` must expose ``'utterance'``, ``'speaker'``
    and ``'method_name'`` columns; ``data_list`` additionally needs
    ``'is_genuine'``. A row whose speaker has no second distinct genuine
    utterance gets no label-1 pair, and a row whose speaker has no
    non-genuine utterance gets no label-0 pair; such rows are skipped instead
    of raising from an empty sample.
    """
    dataset = RandomTripletLossDataset(loader=LOADER(labels, lambda x: x, 0))
    genuine = dataset.genuine
    everything = dataset.data_list
    spoofed = everything[everything["is_genuine"] == 0]

    # A dedicated generator keeps seeded runs reproducible without touching
    # the global random state; unseeded runs keep using the global module.
    rng = random if seed is None else random.Random(seed)

    def _index_by_speaker(frame):
        """Map each speaker to its list of (utterance, method_name) tuples."""
        return {
            speaker: list(zip(group['utterance'], group['method_name']))
            for speaker, group in frame.groupby('speaker', sort=False)
        }

    # Build both lookups once instead of masking the full frames per row.
    genuine_by_speaker = _index_by_speaker(genuine)
    spoofed_by_speaker = _index_by_speaker(spoofed)

    new_dataset = []
    for utterance, speaker, method_name in zip(
        genuine['utterance'], genuine['speaker'], genuine['method_name']
    ):
        # Label 1: a different genuine utterance of the same speaker. Drawing
        # the (utterance, method_name) tuple as a unit keeps both fields from
        # the same row.
        same_candidates = [
            pair for pair in genuine_by_speaker.get(speaker, ()) if pair[0] != utterance
        ]
        if same_candidates:
            other_utterance, other_method = rng.choice(same_candidates)
            new_dataset.append([utterance, method_name, other_utterance, other_method, 1])

        # Label 0: a non-genuine utterance attributed to the same speaker.
        fake_candidates = spoofed_by_speaker.get(speaker)
        if fake_candidates:
            fake_utterance, fake_method = rng.choice(fake_candidates)
            new_dataset.append([utterance, method_name, fake_utterance, fake_method, 0])

    new_df = pd.DataFrame(new_dataset, columns=['utterance', 'method_name', 'utterance_to_check', 'method_name_to_check', 'is_same_speaker'])

    # Make sure the target directory exists before writing.
    out_path = f'../validation_sets/{sete}/{name}.csv'
    os.makedirs(os.path.dirname(out_path), exist_ok=True)
    new_df.to_csv(out_path, index=False)

In [None]:
# Unpack the three label sets returned for "BSI" and write the training pair
# list with the deepfake-aware builder.
# NOTE: this rebinds train_labels / valid_labels / test_labels, replacing the
# values loaded for the previous dataset.
train_labels, valid_labels, test_labels = load_deepfake_dataset("BSI")
create_deepfake(labels=train_labels, name="train", sete="BSI", LOADER=BSILoader)

In [None]:
# Write the BSI validation pairs to ../validation_sets/BSI/valid.csv.
create_deepfake(labels=valid_labels, name="valid", sete="BSI", LOADER=BSILoader)

In [None]:
# Write the BSI test pairs to ../validation_sets/BSI/test.csv.
create_deepfake(labels=test_labels, name="test", sete="BSI", LOADER=BSILoader)