# Part I: VoxCeleb Dataset Preparation

In [1]:
# Import necessary libraries
import os
import torch
import torchaudio
import numpy as np
import pandas as pd
import random
import matplotlib.pyplot as plt
from tqdm import tqdm
from pathlib import Path
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import DataLoader, Dataset
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import normalize
from transformers import Wav2Vec2FeatureExtractor, WavLMModel, HubertModel, Wav2Vec2Model, UniSpeechSatModel
from peft import LoraConfig, get_peft_model, TaskType
import soundfile as sf
import pickle
import shutil
import zipfile
from google.colab import drive

In [2]:
# Set device configuration
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"Using device: {device}")

# Set random seeds for reproducibility
torch.manual_seed(42)
np.random.seed(42)
random.seed(42)

Using device: cuda


In [3]:
# Mount Google Drive
drive.mount('/content/drive')

Mounted at /content/drive


In [4]:
%cd /content/drive/MyDrive/SU_Assignment_2/

/content/drive/MyDrive/SU_Assignment_2


In [5]:
%ls

[0m[01;34mvox1[0m/  [01;34mvox2[0m/


In [6]:
%ls vox1

vox1_test_wav.zip


In [7]:
os.makedirs('/content/voxceleb_data/vox1', exist_ok = True)
os.makedirs('/content/voxceleb_data/vox2_aac', exist_ok = True)
os.makedirs('/content/voxceleb_data/vox2_txt', exist_ok = True)

In [8]:
folder = '/content/drive/MyDrive/SU_Assignment_2'
shared_vox1 = f'{folder}/vox1/vox1_test_wav.zip'
shared_vox2_audio = f'{folder}/vox2/vox2_test_aac-002.zip'
shared_vox2_txt = f'{folder}/vox2/vox2_test_txt.zip'

In [9]:
print(f"vox1_test_wav.zip exists: {os.path.exists(shared_vox1)}")
print(f"vox2_test_aac-002.zip exists: {os.path.exists(shared_vox2_audio)}")
print(f"vox2_test_txt.zip exists: {os.path.exists(shared_vox2_txt)}")

vox1_test_wav.zip exists: True
vox2_test_aac-002.zip exists: True
vox2_test_txt.zip exists: True


In [10]:
# Copy files from Drive to local Colab VM
shutil.copy(shared_vox1, "/content/")
shutil.copy(shared_vox2_audio, "/content/")
shutil.copy(shared_vox2_txt, "/content/")

'/content/vox2_test_txt.zip'

In [11]:
# Extract zip files with full paths
zips_and_targets = {
    "/content/vox1_test_wav.zip": "/content/voxceleb_data/vox1",
    "/content/vox2_test_aac-002.zip": "/content/voxceleb_data/vox2_aac",
    "/content/vox2_test_txt.zip": "/content/voxceleb_data/vox2_txt"
}

for zip_file, target_dir in zips_and_targets.items():
    print(f"Extracting {zip_file} to {target_dir}...")
    with zipfile.ZipFile(zip_file, 'r') as zip_ref:
        zip_ref.extractall(target_dir)
    print(f"Done extracting {zip_file}")

Extracting /content/vox1_test_wav.zip to /content/voxceleb_data/vox1...
Done extracting /content/vox1_test_wav.zip
Extracting /content/vox2_test_aac-002.zip to /content/voxceleb_data/vox2_aac...
Done extracting /content/vox2_test_aac-002.zip
Extracting /content/vox2_test_txt.zip to /content/voxceleb_data/vox2_txt...
Done extracting /content/vox2_test_txt.zip


In [12]:
# Convert VoxCeleb2 AAC files to WAV format
aac_root = "/content/voxceleb_data/vox2_aac/aac"
wav_root = "/content/voxceleb_data/vox2_wav"
os.makedirs(wav_root, exist_ok=True)

# Get speaker directories
speaker_dirs = sorted(os.listdir(aac_root))[:118]
print(f"Total speakers (VoxCeleb2): {len(speaker_dirs)}")

for speaker in tqdm(speaker_dirs, desc="Converting AAC to WAV"):
    speaker_path = os.path.join(aac_root, speaker)
    for root, _, files in os.walk(speaker_path):
        for file in files:
            if file.endswith(".m4a"):
                m4a_path = os.path.join(root, file)
                relative_path = os.path.relpath(m4a_path, aac_root)
                wav_path = os.path.join(wav_root, relative_path.replace(".m4a", ".wav"))
                os.makedirs(os.path.dirname(wav_path), exist_ok=True)
                try:
                    waveform, sr = torchaudio.load(m4a_path)
                    torchaudio.save(wav_path, waveform, sr)
                except Exception as e:
                    print(f"Skipped {m4a_path} — {e}")
print("Finished converting VoxCeleb2 AAC files to WAV.")

Total speakers (VoxCeleb2): 118


Converting AAC to WAV: 100%|██████████| 118/118 [05:57<00:00,  3.03s/it]

Finished converting VoxCeleb2 AAC files to WAV.





In [13]:
def load_audio(path, target_sr=16000):
    try:
        waveform, sample_rate = torchaudio.load(path)
        if waveform.shape[0] > 1:  # Convert stereo to mono if needed
            waveform = waveform.mean(dim=0, keepdim=True)
        if sample_rate != target_sr:
            resampler = torchaudio.transforms.Resample(orig_freq=sample_rate, new_freq=target_sr)
            waveform = resampler(waveform)
        return waveform.squeeze(0)  # Return as torch.Tensor
    except Exception as e:
        print(f"Error loading {path}: {e}")
        # Fallback to soundfile if torchaudio fails
        audio, sr = sf.read(path)
        if sr != target_sr:
            raise ValueError("Resampling not implemented for fallback method.")
        return torch.tensor(audio, dtype=torch.float32)

In [14]:
class VoxCeleb2Dataset(Dataset):
    def __init__(self, root_dir, identity_list, max_samples_per_identity=20, sample_rate=16000):
        self.sample_rate = sample_rate
        self.samples = []
        self.identity_map = {identity: idx for idx, identity in enumerate(sorted(identity_list))}

        for identity in tqdm(sorted(identity_list), desc="Loading dataset"):
            identity_path = os.path.join(root_dir, identity)
            files = []
            for root, _, filenames in os.walk(identity_path):
                for file in filenames:
                    if file.endswith('.wav'):
                        files.append(os.path.join(root, file))

            # Taking a subset of files for each identity
            files = sorted(files)[:max_samples_per_identity]

            for file_path in files:
                self.samples.append((file_path, self.identity_map[identity]))

        print(f"Dataset created with {len(self.samples)} samples across {len(identity_list)} speakers")

    def __len__(self):
        return len(self.samples)

    def __getitem__(self, idx):
        path, label = self.samples[idx]
        waveform = load_audio(path, target_sr=self.sample_rate)
        # Normalize the waveform
        waveform = waveform / (torch.max(torch.abs(waveform)) + 1e-8)
        return waveform, label

In [15]:
def collate_fn(batch):
    waveforms, labels = zip(*batch)
    # Setting maximum audio length (3 seconds at 16kHz)
    MAX_AUDIO_LENGTH = 16000 * 3

    # Clip or pad waveforms
    clipped_waveforms = [w[:MAX_AUDIO_LENGTH] if w.shape[0] > MAX_AUDIO_LENGTH else w for w in waveforms]
    max_len = max(w.shape[0] for w in clipped_waveforms)

    # Pad all waveforms to the same length
    padded = [F.pad(w, (0, max_len - w.shape[0])) for w in clipped_waveforms]

    return torch.stack(padded), torch.tensor(labels)

In [16]:
def prepare_verification_pairs(veri_txt_path, vox1_root):
    pairs = []
    with open(veri_txt_path, 'r') as f:
        for line in tqdm(f, desc="Preparing verification pairs"):
            parts = line.strip().split()
            label = int(parts[0])
            path1 = os.path.join(vox1_root, parts[1])
            path2 = os.path.join(vox1_root, parts[2])

            # Only add valid pairs where both files exist
            if os.path.exists(path1) and os.path.exists(path2):
                pairs.append((path1, path2, label))

    print(f"Prepared {len(pairs)} verification pairs")
    return pairs

In [17]:
def download_verification_list():
    !wget https://mm.kaist.ac.kr/datasets/voxceleb/meta/veri_test.txt -O /content/veri_test.txt
    return "/content/veri_test.txt"

# Download verification list
veri_txt_path = download_verification_list()

--2025-04-06 02:44:45--  https://mm.kaist.ac.kr/datasets/voxceleb/meta/veri_test.txt
Resolving mm.kaist.ac.kr (mm.kaist.ac.kr)... 143.248.39.47
Connecting to mm.kaist.ac.kr (mm.kaist.ac.kr)|143.248.39.47|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 2338640 (2.2M) [text/plain]
Saving to: ‘/content/veri_test.txt’


2025-04-06 02:44:51 (451 KB/s) - ‘/content/veri_test.txt’ saved [2338640/2338640]



In [18]:
def prepare_dataset_splits(wav_root):
    # Get all speaker identities and sort them
    all_identities = sorted(os.listdir(wav_root))

    train_identities = all_identities[:100]  # First 100 identities for training
    test_identities = all_identities[100:118]  # Next 18 identities for testing

    print(f"Total identities: {len(all_identities)}")
    print(f"Training identities: {len(train_identities)}")
    print(f"Testing identities: {len(test_identities)}")

    return train_identities, test_identities

# Prepare dataset splits
train_identities, test_identities = prepare_dataset_splits(wav_root)

Total identities: 118
Training identities: 100
Testing identities: 18


In [19]:
def create_datasets(wav_root, train_identities, test_identities):
    # Create dataset instances
    train_dataset = VoxCeleb2Dataset(
        root_dir=wav_root,
        identity_list=train_identities,
        max_samples_per_identity=20
    )

    test_dataset = VoxCeleb2Dataset(
        root_dir=wav_root,
        identity_list=test_identities,
        max_samples_per_identity=20
    )

    return train_dataset, test_dataset

# Create datasets
train_dataset, test_dataset = create_datasets(wav_root, train_identities, test_identities)

Loading dataset: 100%|██████████| 100/100 [00:00<00:00, 771.39it/s]


Dataset created with 2000 samples across 100 speakers


Loading dataset: 100%|██████████| 18/18 [00:00<00:00, 736.46it/s]

Dataset created with 360 samples across 18 speakers





In [20]:
def create_data_loaders(train_dataset, test_dataset, batch_size=8):
    train_loader = DataLoader(
        train_dataset,
        batch_size=batch_size,
        shuffle=True,
        collate_fn=collate_fn,
        num_workers=2
    )

    test_loader = DataLoader(
        test_dataset,
        batch_size=batch_size,
        shuffle=False,
        collate_fn=collate_fn,
        num_workers=2
    )

    print(f"Created data loaders - Training: {len(train_loader)}, Testing: {len(test_loader)}")
    return train_loader, test_loader

# Create data loaders
train_loader, test_loader = create_data_loaders(train_dataset, test_dataset)

Created data loaders - Training: 250, Testing: 45


In [21]:
# Preparing data for multi-speaker scenarios
def prepare_multispeaker_data(wav_root, train_identities):
    ms_train_identities = train_identities[:50]  # First 50 for multi-speaker training
    ms_test_identities = train_identities[50:100]  # Next 50 for multi-speaker testing

    print(f"Multi-speaker training identities: {len(ms_train_identities)}")
    print(f"Multi-speaker testing identities: {len(ms_test_identities)}")

    # Create multi-speaker datasets
    ms_train_dataset = VoxCeleb2Dataset(
        root_dir=wav_root,
        identity_list=ms_train_identities,
        max_samples_per_identity=20
    )

    ms_test_dataset = VoxCeleb2Dataset(
        root_dir=wav_root,
        identity_list=ms_test_identities,
        max_samples_per_identity=20
    )

    print(f"Multi-speaker training samples: {len(ms_train_dataset)}")
    print(f"Multi-speaker testing samples: {len(ms_test_dataset)}")

    return ms_train_dataset, ms_test_dataset, ms_train_identities, ms_test_identities

# Prepare multi-speaker data
ms_train_dataset, ms_test_dataset, ms_train_identities, ms_test_identities = prepare_multispeaker_data(wav_root, train_identities)

Multi-speaker training identities: 50
Multi-speaker testing identities: 50


Loading dataset: 100%|██████████| 50/50 [00:00<00:00, 888.10it/s]


Dataset created with 1000 samples across 50 speakers


Loading dataset: 100%|██████████| 50/50 [00:00<00:00, 813.70it/s]

Dataset created with 1000 samples across 50 speakers
Multi-speaker training samples: 1000
Multi-speaker testing samples: 1000





In [22]:
def prepare_verification_data(veri_txt_path):
    vox1_wav_root = "/content/voxceleb_data/vox1/wav"
    verification_pairs = prepare_verification_pairs(veri_txt_path, vox1_wav_root)
    return verification_pairs, vox1_wav_root

# Prepare verification data
verification_pairs, vox1_wav_root = prepare_verification_data(veri_txt_path)

Preparing verification pairs: 37720it [00:00, 113016.62it/s]

Prepared 37720 verification pairs





In [23]:
def save_processed_data(wav_root, vox1_wav_root, veri_txt_path, train_identities, test_identities,
                        ms_train_identities, ms_test_identities, train_dataset, test_dataset,
                        ms_train_dataset, ms_test_dataset, verification_pairs):
    os.makedirs('/content/processed_data', exist_ok=True)

    # Save dataset information
    data_info = {
        'vox2_wav_root': wav_root,
        'vox1_wav_root': vox1_wav_root,
        'veri_txt_path': veri_txt_path,
        'train_identities': train_identities,
        'test_identities': test_identities,
        'ms_train_identities': ms_train_identities,
        'ms_test_identities': ms_test_identities,
        'train_samples': [(sample[0], sample[1]) for sample in train_dataset.samples],
        'test_samples': [(sample[0], sample[1]) for sample in test_dataset.samples],
        'ms_train_samples': [(sample[0], sample[1]) for sample in ms_train_dataset.samples],
        'ms_test_samples': [(sample[0], sample[1]) for sample in ms_test_dataset.samples],
        'verification_pairs': verification_pairs
    }

    with open('/content/processed_data/voxceleb_data_info.pkl', 'wb') as f:
        pickle.dump(data_info, f)

    print("Saved dataset information to /content/processed_data/voxceleb_data_info.pkl")

# Save processed data
save_processed_data(wav_root, vox1_wav_root, veri_txt_path, train_identities, test_identities,
                   ms_train_identities, ms_test_identities, train_dataset, test_dataset,
                   ms_train_dataset, ms_test_dataset, verification_pairs)

Saved dataset information to /content/processed_data/voxceleb_data_info.pkl


In [24]:
from google.colab import files
files.download('/content/processed_data/voxceleb_data_info.pkl')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>