In [None]:
from tqdm import tqdm
import numpy as np
import torchaudio
import torch
import os
import wespeaker
import pandas as pd
from torch.utils.data import DataLoader
from torch.utils.data import Dataset
import os
import subprocess
import sys 


In [None]:
def scan_directory_voxceleb1(test_dir):
    """Index an audio tree laid out as ``<test_dir>/<person_id>/<utterance_env>/<file>``.

    Every regular file found at the third level is loaded with
    ``torchaudio.load`` to measure its duration. Entries that are not
    directories at the first two levels, and entries that are not regular
    files at the third level, are skipped.

    Directory listings are visited in sorted order so that the row order of
    the result is reproducible (``os.listdir`` returns names in arbitrary
    order).

    Args:
        test_dir: Root directory of the dataset.

    Returns:
        pandas.DataFrame with one row per file and the columns ``path``,
        ``person_id``, ``utterance_env``, ``utterance_filename``,
        ``embedding`` (a constant placeholder string) and ``duration``
        (size of dim 1 of the loaded waveform divided by its sample rate).
    """
    columns = [
        "path",
        "person_id",
        "utterance_env",
        "utterance_filename",
        "embedding",
        "duration",
    ]
    # Placeholder only: no embedding is extracted in this function.
    embedding = "embedding_placeholder"

    data = []
    for person_id in sorted(os.listdir(test_dir)):
        person_path = os.path.join(test_dir, person_id)
        if not os.path.isdir(person_path):
            continue
        for utterance_env in sorted(os.listdir(person_path)):
            utterance_path = os.path.join(person_path, utterance_env)
            if not os.path.isdir(utterance_path):
                continue
            for file in sorted(os.listdir(utterance_path)):
                file_path = os.path.join(utterance_path, file)
                if not os.path.isfile(file_path):
                    continue
                # The whole waveform is decoded just to measure its length.
                waveform, sample_rate = torchaudio.load(file_path)
                duration = waveform.shape[1] / sample_rate
                data.append(
                    [file_path, person_id, utterance_env, file, embedding, duration]
                )

    return pd.DataFrame(data, columns=columns)


# Example usage: build the file index for the directory below.
# The path is relative, so it is resolved against the kernel's working directory.
test_dir = "../data/vox1_test_wav"
df = scan_directory_voxceleb1(test_dir)

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) of df's numeric columns.
df.describe()

In [None]:
def scan_directory_voxceleb2(test_dir):
    """Index an audio tree laid out as ``<test_dir>/<person_id>/<file>``.

    Every regular file found directly inside a person directory is loaded
    with ``torchaudio.load`` to measure its duration. Top-level entries that
    are not directories, and second-level entries that are not regular
    files, are skipped.

    Directory listings are visited in sorted order so that the row order of
    the result is reproducible (``os.listdir`` returns names in arbitrary
    order).

    Args:
        test_dir: Root directory of the dataset.

    Returns:
        pandas.DataFrame with one row per file and the columns ``path``,
        ``person_id``, ``utterance_filename``, ``embedding`` (a constant
        placeholder string) and ``duration`` (size of dim 1 of the loaded
        waveform divided by its sample rate).
    """
    columns = ["path", "person_id", "utterance_filename", "embedding", "duration"]
    # Placeholder only: no embedding is extracted in this function.
    embedding = "embedding_placeholder"

    data = []
    for person_id in sorted(os.listdir(test_dir)):
        person_path = os.path.join(test_dir, person_id)
        if not os.path.isdir(person_path):
            continue
        for file in sorted(os.listdir(person_path)):
            file_path = os.path.join(person_path, file)
            if not os.path.isfile(file_path):
                continue
            # The whole waveform is decoded just to measure its length.
            waveform, sample_rate = torchaudio.load(file_path)
            duration = waveform.shape[1] / sample_rate
            data.append([file_path, person_id, file, embedding, duration])

    return pd.DataFrame(data, columns=columns)


# Example usage: index a flat <person_id>/<file> tree. This rebinds `test_dir` and `df`.
# NOTE(review): this is the same path a later cell uses as `new_dataset_dir`, i.e. the
# output of the windowing step. On a fresh kernel the directory may not exist yet when
# this cell runs - confirm the intended cell order.
test_dir = "../data/vox1_test_wav_windowed"
df = scan_directory_voxceleb2(test_dir)

In [None]:
# Summary statistics of the numeric columns of the re-scanned index in `df`.
df.describe()

In [None]:
import matplotlib.pyplot as plt

# Histogram of the per-clip durations stored in `df`, drawn through the
# explicit figure/axes interface.
fig, ax = plt.subplots(figsize=(10, 6))
ax.hist(df["duration"], bins=50, edgecolor="black")
ax.set_title("Distribution of Audio Clip Durations")
ax.set_xlabel("Duration (seconds)")
ax.set_ylabel("Frequency")
plt.show()

In [None]:
# `df` has not been reassigned since the previous describe() cell, so this shows the same summary.
df.describe()

In [None]:
window_size = 8

# Each clip contributes one file per full window, plus one more when a partial
# window is left over (i.e. the ceiling of duration / window_size).
total_files = sum(
    int(clip_duration // window_size) + (1 if clip_duration % window_size > 0 else 0)
    for clip_duration in df["duration"]
)

total_files

In [None]:
from numpy import full


def repeat_to_max_len(data, max_len):
    """Tile a waveform along dim 1 until it is ``max_len`` long.

    Inputs whose dim 1 already has ``max_len`` or more elements are returned
    unchanged; nothing is ever cut.

    Args:
        data: torch.Tensor (arbitrary length along dim 1)
        max_len: target length along dim 1

    Returns:
        torch.Tensor of length ``max_len`` along dim 1 when the input was
        shorter, otherwise ``data`` itself

    Raises:
        ValueError: if ``data`` has zero length along dim 1
    """
    n_samples = data.shape[1]
    if n_samples == 0:
        raise ValueError("data_len should not be zero")

    # Already long enough: hand the input back untouched.
    if n_samples >= max_len:
        return data

    # Whole copies first, then a partial copy to land exactly on max_len.
    full_copies, leftover = divmod(max_len, n_samples)
    pieces = [data for _ in range(full_copies)]
    if leftover > 0:
        pieces.append(data[:, :leftover])
    return torch.cat(pieces, dim=1)


def allign_dataframe_durations_celeb2(df, window_size, new_dataset_dir):
    """Re-save every clip listed in ``df`` as fixed-length windows.

    Each clip is loaded with ``torchaudio.load`` and written to
    ``<new_dataset_dir>/<person_id>/``:

    * A clip shorter than one window is repeated up to the window length and
      saved under its original file name.
    * Any other clip is cut into consecutive full windows saved as
      ``<stem>_<i><ext>``; a trailing partial window, if any, is repeated up
      to the window length and saved with the next index.

    Args:
        df: DataFrame with the columns ``path``, ``person_id`` and
            ``utterance_filename``.
        window_size: Window length in seconds. It is multiplied by each
            file's sample rate and truncated to a whole number of samples.
        new_dataset_dir: Root directory for the windowed files. It is
            created, together with the per-person sub-directories, if
            missing.

    Returns:
        pandas.DataFrame with one row per written file and the columns
        ``path``, ``person_id`` and ``utterance_filename``, where ``path``
        is exactly the location passed to ``torchaudio.save``.

    Raises:
        ValueError: propagated from ``repeat_to_max_len`` when a loaded
            waveform has zero length.
    """
    os.makedirs(new_dataset_dir, exist_ok=True)

    new_data = []
    for _, row in df.iterrows():
        person_dir = os.path.join(new_dataset_dir, row["person_id"])
        os.makedirs(person_dir, exist_ok=True)

        waveform, sample_rate = torchaudio.load(row["path"])
        # int() keeps the slice bounds integral for a fractional window_size.
        max_len = int(window_size * sample_rate)  # e.g. 8 * 16000 = 128000
        n_samples = waveform.shape[1]

        # (file name, samples) pairs to write for this clip.
        pieces = []
        if n_samples < max_len:
            # Shorter than one window: repeat it, keep the original file name.
            pieces.append(
                (row["utterance_filename"], repeat_to_max_len(waveform, max_len))
            )
        else:
            # splitext keeps the real extension, so the index suffix is added
            # for any audio format, not only for names containing ".wav".
            stem, ext = os.path.splitext(row["utterance_filename"])
            num_windows, remainder = divmod(n_samples, max_len)

            for i in range(num_windows):
                window = waveform[:, i * max_len:(i + 1) * max_len]
                pieces.append((f"{stem}_{i}{ext}", window))

            if remainder > 0:
                # The trailing partial window is repeated up to full length.
                tail = waveform[:, num_windows * max_len:]
                pieces.append(
                    (f"{stem}_{num_windows}{ext}", repeat_to_max_len(tail, max_len))
                )

        for file_name, window in pieces:
            full_path = os.path.join(person_dir, file_name)
            torchaudio.save(full_path, window, sample_rate)
            # Record the path that was actually written.
            new_data.append([full_path, row["person_id"], file_name])

    new_df = pd.DataFrame(
        new_data,
        columns=["path", "person_id", "utterance_filename"],
    )
    return new_df


# Example usage: re-save every clip listed in `df` as 8-second windows.
# NOTE(review): `new_dataset_dir` is the same directory an earlier cell scans to build
# `df`, so run top to bottom this reads from and writes to the same tree - confirm
# which dataframe was meant to be passed here.
window_size = 8
new_dataset_dir = "../data/vox1_test_wav_windowed"
new_df = allign_dataframe_durations_celeb2(df, window_size, new_dataset_dir)

# TODO: output files are written to <new_dataset_dir>/<person_id>/<file name> with no
# per-environment sub-directory, so clips of the same person that share a file name
# overwrite each other.

In [None]:
# Preview the first 20 rows of the windowed-file index.
new_df.head(20)

In [None]:
# Column/dtype overview, then the number of distinct output paths. A count lower than
# the row count reported by info() means several rows share one file path.
new_df.info()
unique_paths = new_df['path'].unique()
len(unique_paths)