# Data Preparation & Processing

In [None]:
import os
from pathlib import Path
import pandas as pd
import torchaudio

# Root of the WAV tree; every immediate sub-directory is treated as one speaker
# and its directory name is used as the speaker ID.
VOX_PATH = "/VOXCELEB/wav"
audio_files = []
speaker_ids = []
durations = []

for speaker_dir in Path(VOX_PATH).iterdir():
    if not speaker_dir.is_dir():
        continue
    for audio_path in speaker_dir.rglob("*.wav"):
        try:
            info = torchaudio.info(str(audio_path))
            duration = info.num_frames / info.sample_rate
        except Exception as exc:
            # Best effort: the file is still listed, with an unknown duration,
            # but the failure is reported instead of being silently dropped.
            print(f"Could not read audio metadata for {audio_path}: {exc}")
            duration = None
        audio_files.append(str(audio_path))
        speaker_ids.append(speaker_dir.name)
        durations.append(duration)

# One row per discovered WAV file.
df = pd.DataFrame({
    "file_path": audio_files,
    "speaker_id": speaker_ids,
    "duration": durations
})

# Maximum number of files kept per speaker.
N = 10
# Shuffle once (seeded), then keep the first N rows of every speaker. Unlike
# groupby(...).sample(n=N), this does not raise when a speaker has fewer than
# N files: such speakers simply contribute all of their files.
df_sampled = (
    df.sample(frac=1, random_state=42)
    .groupby("speaker_id")
    .head(N)
    .sort_index()  # restore the original discovery order
)

df_sampled.to_csv("voxceleb_samples.csv", index=False)


# ECAPA-TDNN Feature Extraction & Feature Database Establishment

In [None]:
import pandas as pd
import torch
from speechbrain.pretrained import EncoderClassifier
from tqdm import tqdm

import numpy as np

# Speaker encoder loaded from the "speechbrain/spkrec-ecapa-voxceleb" source, run on CPU.
classifier = EncoderClassifier.from_hparams(source="speechbrain/spkrec-ecapa-voxceleb", run_opts={"device":"cpu"})

# Read the sample list from the same relative path the data-preparation step
# writes it to ("voxceleb_samples.csv"), not from the filesystem root.
df = pd.read_csv("voxceleb_samples.csv")

# speaker_id -> mean embedding over that speaker's sampled files.
speaker_embeds = {}

for speaker_id, group in tqdm(df.groupby("speaker_id")):
    embeds = []
    for file_path in group["file_path"]:
        signal = classifier.load_audio(file_path)
        # unsqueeze(0) adds a leading batch axis of size 1 before encoding;
        # squeeze(0) removes that axis again from the result.
        embedding = classifier.encode_batch(signal.unsqueeze(0))
        embeds.append(embedding.squeeze(0).detach().numpy())
    # Average directly in NumPy; building a torch tensor from a list of arrays
    # is slow and only to convert straight back to NumPy.
    speaker_embeds[speaker_id] = np.stack(embeds).mean(axis=0)

# One array per speaker, stored under the speaker ID as the archive key.
np.savez("speaker_embeddings.npz", **speaker_embeds)


# Mapping Dictionary Construction

In [None]:
import pandas as pd

# Load the tab-separated metadata table and build a lookup keyed by the
# 'VoxCeleb1 ID' column whose values come from the 'VGGFace1 ID' column.
meta = pd.read_csv('/Users/liuxiaoyue/Documents/TL/Project_root/vox1_meta.csv', sep='\t')
id2name = meta.set_index('VoxCeleb1 ID')['VGGFace1 ID'].to_dict()


# Most Similar Celebrity Match

In [None]:
import numpy as np
from speechbrain.pretrained import EncoderClassifier

# Load the stored per-speaker embeddings: archive keys are the speaker IDs,
# and row i of `embeds` belongs to speaker_ids[i].
data = np.load("speaker_embeddings.npz")
speaker_ids = list(data.files)
embeds = np.array([np.squeeze(data[name]) for name in speaker_ids])

# Same encoder source as used to build the archive, run on CPU.
classifier = EncoderClassifier.from_hparams(
    source="speechbrain/spkrec-ecapa-voxceleb",
    run_opts={"device": "cpu"},
)

def extract_embedding(audio_path):
    """Encode one audio file with the module-level ``classifier``.

    Args:
        audio_path: Path of the audio file to load.

    Returns:
        The encoder output as a NumPy array with all size-1 axes removed.
    """
    waveform = classifier.load_audio(audio_path)
    # The encoder is called on a batch, so add a leading batch axis of size 1.
    batch = waveform.unsqueeze(0)
    encoded = classifier.encode_batch(batch)
    return encoded.squeeze().detach().numpy()

def match_top3(audio_path):
    """Return the IDs of the enrolled speakers most similar to a recording.

    The recording is embedded with ``extract_embedding`` and compared by
    cosine similarity against every row of the module-level ``embeds``
    matrix; row ``i`` corresponds to ``speaker_ids[i]``.

    Args:
        audio_path: Path of the query audio file.

    Returns:
        Up to three speaker IDs, ordered from most to least similar. Fewer
        than three are returned when fewer speakers are enrolled.
    """
    query_emb = extract_embedding(audio_path)
    dots = np.dot(embeds, query_emb)
    norms = np.linalg.norm(embeds, axis=1) * np.linalg.norm(query_emb)
    # Cosine similarity is undefined for a zero-norm vector. Score those rows
    # as 0 instead of NaN, because argsort places NaN last and it would
    # otherwise be reported as the best match.
    sim = np.divide(dots, norms, out=np.zeros_like(dots), where=norms > 0)
    # argsort is ascending: take the last three and reverse for best-first.
    top3_idx = np.argsort(sim)[-3:][::-1]
    return [speaker_ids[i] for i in top3_idx]


# Example query: print the best-matching enrolled speaker IDs for one audio file.
print(match_top3("query audio.wav"))


# Top-1 / Top-3 Accuracy Evaluation

In [None]:
import os
from tqdm import tqdm

# Walk <test_root>/<speaker_id>/<session>/*.wav, run match_top3 on every file
# and count how often the directory's speaker ID is the first result (Top-1)
# or appears anywhere in the returned results (Top-3).
test_root = "/VOXCELEB/wav"

top1_correct = 0
top3_correct = 0
total = 0

for speaker_id in tqdm(os.listdir(test_root)):
    speaker_dir = os.path.join(test_root, speaker_id)
    # Entries that are not directories contribute no sessions.
    sessions = os.listdir(speaker_dir) if os.path.isdir(speaker_dir) else []
    for session in sessions:
        session_dir = os.path.join(speaker_dir, session)
        wav_names = os.listdir(session_dir) if os.path.isdir(session_dir) else []
        for wavfile in wav_names:
            if wavfile.endswith(".wav"):
                audio_path = os.path.join(session_dir, wavfile)
                results = match_top3(audio_path)
                true_id = speaker_id
                total += 1
                top1_correct += int(results[0] == true_id)
                if true_id in results:
                    top3_correct += 1
                else:
                    # Log every file whose true speaker is missing from the results.
                    print(f"Audio: {audio_path} | Real ID: {true_id} | Predicted Top 3: {results}")

if total:
    print(f"Total test samples: {total}")
    print(f"Top-1 accuracy: {top1_correct / total:.3%}")
    print(f"Top-3 accuracy: {top3_correct / total:.3%}")
else:
    print("No test samples found, please check the directory structure and file names!")
