In [1]:
import random
import warnings
from pathlib import Path

import pandas as pd

from dataloader import ValidationDataset, LibriSpeechLoader, VoxCelebLoader
from utils import load_deepfake_dataset

In [2]:
def create(labels, name, sete, LOADER, seed=None, output_root='../validation_sets'):
    """Build a speaker-verification trial list and save it as a CSV file.

    For every row of the dataset two trials are generated, in this order:

    * a positive trial ``[utterance, other utterance of the same speaker, 1]``
    * a negative trial ``[utterance, utterance of a different speaker, 0]``

    A positive trial is skipped when the speaker has no other utterance, and a
    negative trial is skipped when the dataset contains a single speaker (both
    situations used to abort the whole run with an exception). A warning
    reports how many trials were skipped.

    Args:
        labels: Label data handed to ``LOADER`` unchanged.
        name: File stem of the CSV (e.g. ``"train"``).
        sete: Name of the sub-directory of ``output_root`` to write into
            (e.g. ``"LibriSpeech"``).
        LOADER: Loader class; it is invoked as ``LOADER(labels, lambda x: x, 0)``
            and the result is wrapped in ``ValidationDataset``. The meaning of
            the last two arguments is defined by the loader classes.
        seed: Optional seed for the pair sampling. ``None`` (default) draws a
            fresh random sample on every call; any other value makes the
            output reproducible.
        output_root: Directory that holds the per-dataset sub-directories.
            Missing directories are created.

    Returns:
        None. The result is written to ``{output_root}/{sete}/{name}.csv`` with
        the columns ``utterance``, ``utterance_to_check``, ``is_same_speaker``.
    """
    dataset = ValidationDataset(loader=LOADER(labels, lambda x: x, 0))
    df = dataset.data_list  # used below as a DataFrame with 'speaker' and 'utterance' columns

    # A private generator makes the draw reproducible when `seed` is given and
    # leaves the global `random` / NumPy state untouched.
    rng = random.Random(seed)

    # Index every speaker's utterances once instead of re-filtering the whole
    # frame for each row (which made the original loop quadratic).
    utterances_by_speaker = {
        speaker: group['utterance'].tolist()
        for speaker, group in df.groupby('speaker', sort=False)
    }
    speakers = list(utterances_by_speaker)
    speaker_position = {speaker: position for position, speaker in enumerate(speakers)}

    def get_random_same_speaker(speaker, current_utterance):
        """Return another utterance of `speaker`, or None if there is none."""
        candidates = [
            candidate
            for candidate in utterances_by_speaker[speaker]
            if candidate != current_utterance
        ]
        return rng.choice(candidates) if candidates else None

    def get_random_different_speaker(speaker):
        """Return an utterance of a uniformly chosen other speaker, or None."""
        if len(speakers) < 2:
            return None
        # Draw from the n-1 other speakers: pick a slot in [0, n-2] and shift
        # it past the current speaker's own position.
        pick = rng.randrange(len(speakers) - 1)
        if pick >= speaker_position[speaker]:
            pick += 1
        return rng.choice(utterances_by_speaker[speakers[pick]])

    new_dataset = []
    skipped_positive = 0
    skipped_negative = 0

    for utterance, speaker in zip(df['utterance'], df['speaker']):
        same_speaker_utterance = get_random_same_speaker(speaker, utterance)
        if same_speaker_utterance is None:
            skipped_positive += 1
        else:
            new_dataset.append([utterance, same_speaker_utterance, 1])

        different_speaker_utterance = get_random_different_speaker(speaker)
        if different_speaker_utterance is None:
            skipped_negative += 1
        else:
            new_dataset.append([utterance, different_speaker_utterance, 0])

    if skipped_positive or skipped_negative:
        warnings.warn(
            f"create({sete}/{name}): skipped {skipped_positive} same-speaker and "
            f"{skipped_negative} different-speaker trial(s) because no partner "
            "utterance was available"
        )

    new_df = pd.DataFrame(
        new_dataset,
        columns=['utterance', 'utterance_to_check', 'is_same_speaker'],
    )

    # Make sure the target directory exists before writing.
    output_path = Path(output_root) / sete / f'{name}.csv'
    output_path.parent.mkdir(parents=True, exist_ok=True)
    new_df.to_csv(output_path, index=False)

In [None]:
# Load the LibriSpeech label splits, then write the training trial list.
# `valid_labels` and `test_labels` stay bound at notebook level and are read by
# the cells that write the LibriSpeech "valid" and "test" files.
train_labels, valid_labels, test_labels = load_deepfake_dataset("LibriSpeech")
create(labels=train_labels, name="train", sete="LibriSpeech", LOADER=LibriSpeechLoader)

In [None]:
# Write the LibriSpeech validation trial list. Uses whatever `valid_labels`
# currently holds, so the load_deepfake_dataset("LibriSpeech") cell must have
# been the most recent load.
create(labels=valid_labels, name="valid", sete="LibriSpeech", LOADER=LibriSpeechLoader)

In [None]:
# Write the LibriSpeech test trial list. Uses whatever `test_labels` currently
# holds, so the load_deepfake_dataset("LibriSpeech") cell must have been the
# most recent load.
create(labels=test_labels, name="test", sete="LibriSpeech", LOADER=LibriSpeechLoader)

In [6]:
# Load the VoxCeleb label splits, then write the training trial list.
# This rebinds `train_labels`, `valid_labels` and `test_labels`, replacing any
# LibriSpeech splits loaded earlier in the session.
train_labels, valid_labels, test_labels = load_deepfake_dataset("VoxCeleb")
create(labels=train_labels, name="train", sete="VoxCeleb", LOADER=VoxCelebLoader)

In [4]:
# Write the VoxCeleb validation trial list. Uses whatever `valid_labels`
# currently holds: run the load_deepfake_dataset("VoxCeleb") cell first.
# NOTE(review): the saved execution counts show this cell (4) was last run
# before the VoxCeleb load cell (6) — re-run top to bottom to be sure the
# labels are the VoxCeleb ones.
create(labels=valid_labels, name="valid", sete="VoxCeleb", LOADER=VoxCelebLoader)

In [5]:
# Write the VoxCeleb test trial list. Uses whatever `test_labels` currently
# holds: run the load_deepfake_dataset("VoxCeleb") cell first.
# NOTE(review): the saved execution counts show this cell (5) was last run
# before the VoxCeleb load cell (6) — re-run top to bottom.
create(labels=test_labels, name="test", sete="VoxCeleb", LOADER=VoxCelebLoader)
# TODO (original author's note): use https://www.robots.ox.ac.uk/~vgg/data/voxceleb/meta/veri_test.txt
# — presumably as the trial list for the VoxCeleb test set instead of the
# randomly drawn pairs above; confirm.