In [1]:
from tqdm import tqdm
import numpy as np
import torchaudio
import torch
import os
import wespeaker
import pandas as pd
from torch.utils.data import DataLoader
from torch.utils.data import Dataset
import os
import subprocess
import sys 


In [None]:
embeds_dir = "./embeds"

In [20]:
def scan_directory_voxceleb1(test_dir):
    """Index every file stored two directory levels below ``test_dir``.

    The walk expects ``test_dir/<person_id>/<utterance_env>/<file>``. Entries
    that are not directories at the first two levels, and entries that are not
    regular files at the third level, are ignored.

    Args:
        test_dir: Root directory to scan.

    Returns:
        pandas.DataFrame with the columns ``path``, ``person_id``,
        ``utterance_env`` and ``utterance_filename`` (one row per file, in
        the order reported by ``os.listdir``).
    """
    rows = []
    for speaker in os.listdir(test_dir):
        speaker_dir = os.path.join(test_dir, speaker)
        if not os.path.isdir(speaker_dir):
            continue
        for session in os.listdir(speaker_dir):
            session_dir = os.path.join(speaker_dir, session)
            if not os.path.isdir(session_dir):
                continue
            for name in os.listdir(session_dir):
                candidate = os.path.join(session_dir, name)
                if os.path.isfile(candidate):
                    rows.append([candidate, speaker, session, name])

    return pd.DataFrame(
        rows,
        columns=['path', 'person_id', 'utterance_env', 'utterance_filename'],
    )

# Example usage
test_dir = '../data/vox1_test_wav'
df = scan_directory_voxceleb1(test_dir)

In [20]:
def scan_directory_voxceleb2(test_dir):
    """Index every file stored one directory level below ``test_dir``.

    The walk expects ``test_dir/<person_id>/<file>``. Top-level entries that
    are not directories, and nested entries that are not regular files, are
    ignored.

    Args:
        test_dir: Root directory to scan.

    Returns:
        pandas.DataFrame with the columns ``path``, ``person_id`` and
        ``utterance_filename`` (one row per file, in ``os.listdir`` order).
    """
    rows = []
    for speaker in os.listdir(test_dir):
        speaker_dir = os.path.join(test_dir, speaker)
        if not os.path.isdir(speaker_dir):
            continue
        for name in os.listdir(speaker_dir):
            candidate = os.path.join(speaker_dir, name)
            if os.path.isfile(candidate):
                rows.append([candidate, speaker, name])

    return pd.DataFrame(rows, columns=['path', 'person_id', 'utterance_filename'])


In [5]:

# Example usage
test_dir = '../data/voxceleb2_eval_segments'
df = scan_directory_voxceleb2(test_dir)

In [6]:
df

Unnamed: 0,path,person_id,utterance_filename,embedding
0,../data/voxceleb2_eval_segments/id02019/50_seg...,id02019,50_seg3_.wav,embedding_placeholder
1,../data/voxceleb2_eval_segments/id02019/194_se...,id02019,194_seg0_.wav,embedding_placeholder
2,../data/voxceleb2_eval_segments/id02019/196_se...,id02019,196_seg8_.wav,embedding_placeholder
3,../data/voxceleb2_eval_segments/id02019/87_seg...,id02019,87_seg8_.wav,embedding_placeholder
4,../data/voxceleb2_eval_segments/id02019/1_seg9...,id02019,1_seg9_.wav,embedding_placeholder
...,...,...,...,...
11091,../data/voxceleb2_eval_segments/id03347/268_se...,id03347,268_seg8_.wav,embedding_placeholder
11092,../data/voxceleb2_eval_segments/id03347/323_se...,id03347,323_seg4_.wav,embedding_placeholder
11093,../data/voxceleb2_eval_segments/id03347/313_se...,id03347,313_seg17_.wav,embedding_placeholder
11094,../data/voxceleb2_eval_segments/id03347/259_se...,id03347,259_seg15_.wav,embedding_placeholder


In [None]:
def convert_mp4_to_wav(mp4_path, wav_path):
    """Transcode one media file by invoking ``ffmpeg -i <mp4_path> <wav_path>``.

    Args:
        mp4_path: Path of the input file handed to ffmpeg's ``-i`` option.
        wav_path: Path of the output file ffmpeg should write.

    Raises:
        subprocess.CalledProcessError: If ffmpeg exits with a non-zero status
            (``check=True``).
    """
    subprocess.run(["ffmpeg", "-i", mp4_path, wav_path], check=True)

# Root directory that receives the converted audio, one sub-directory per speaker.
output_dir = "../data/voxceleb2_wav_eval"
os.makedirs(output_dir, exist_ok=True)

# Convert every file listed in `df` and mirror the per-speaker layout under `output_dir`.
# NOTE(review): the `df` displayed earlier in this notebook already lists `.wav`
# segment paths — confirm that `df` holds MP4 paths before running this cell.
for index, row in tqdm(df.iterrows(), total=len(df), desc="Converting MP4 to WAV"):
    # Make sure the speaker's output directory exists before writing into it.
    person_dir = os.path.join(output_dir, row['person_id'])
    os.makedirs(person_dir, exist_ok=True)
    
    mp4_file = row['path']
    # Keep the source base name and swap its extension for ".wav".
    wav_file = os.path.join(person_dir, os.path.splitext(os.path.basename(mp4_file))[0] + ".wav")
    convert_mp4_to_wav(mp4_file, wav_file)
    print(f"Converted {mp4_file} to {wav_file}")


In [None]:
df

In [None]:
import torchaudio


class AudioDataset(Dataset):
    """Dataset that loads audio files and pads or cuts them to a fixed length.

    Each item is a dict with the keys ``path``, ``waveform`` and
    ``sample_rate``.

    Args:
        dataframe: pandas.DataFrame with a ``path`` column of audio file paths.
        max_len: Target number of samples along dimension 1 of each waveform.
    """

    def __init__(self, dataframe, max_len):
        self.dataframe = dataframe
        self.max_len = max_len

    def __len__(self):
        """Return the number of rows in the backing DataFrame."""
        return len(self.dataframe)

    def __getitem__(self, idx):
        """Load the file at positional row ``idx`` and return it as a sample dict."""
        if torch.is_tensor(idx):
            idx = idx.tolist()

        audio_path = self.dataframe.iloc[idx]['path']
        waveform, sample_rate = torchaudio.load(audio_path)

        # Use the length configured on this instance. The previous code read a
        # notebook-level global `max_len`, which ignored the constructor
        # argument and failed with NameError when no such global existed.
        waveform = self.pad_or_cut_wave(waveform, self.max_len)

        sample = {'path': audio_path, 'waveform': waveform, 'sample_rate': sample_rate}

        return sample

    def pad_or_cut_wave(self, data, max_len):
        """Pad or cut a single wave to the specified length.

        Args:
            data: torch.Tensor whose dimension 1 is the (variable) length
            max_len: length to pad or cut the data to

        Returns:
            torch.Tensor zero-padded at the end of its last dimension, or
            truncated along dimension 1, so that the length equals max_len
        """
        data_len = data.shape[1]
        if data_len < max_len:
            padding = max_len - data_len
            # (0, padding): nothing before, `padding` zeros after the last dim.
            data = torch.nn.functional.pad(data, (0, padding))
        else:
            data = data[:, :max_len]
        return data

# Create an instance of the dataset
# 1 s  = 16_000 samples
max_len = 5 * 16000
audio_dataset = AudioDataset(df, max_len)

In [None]:
from torch.utils.data import DataLoader

# Create a DataLoader for the audio_dataset
audio_dataloader = DataLoader(audio_dataset, batch_size=32, shuffle=False)


In [3]:
campplus_model = wespeaker.load_model("campplus")
campplus_model.set_device("mps")

In [21]:
class AudioDatasetFBank(Dataset):
    """Dataset that loads audio, fixes its length and computes fbank features.

    Each item is a dict with the keys ``path``, ``fbank`` and ``sample_rate``.

    Args:
        dataframe: pandas.DataFrame with a ``path`` column of audio file paths.
        max_len: Target number of samples along dimension 1 of each waveform.
        model: Object exposing ``compute_fbank(waveform)``; its return value
            is stored under ``fbank``.
    """

    def __init__(self, dataframe, max_len, model):
        self.dataframe = dataframe
        self.max_len = max_len
        self.model = model

    def __len__(self):
        """Return the number of rows in the backing DataFrame."""
        return len(self.dataframe)

    def __getitem__(self, idx):
        """Load the file at positional row ``idx`` and return its fbank sample dict."""
        if torch.is_tensor(idx):
            idx = idx.tolist()

        audio_path = self.dataframe.iloc[idx]["path"]
        waveform, sample_rate = torchaudio.load(audio_path)

        # Use the length configured on this instance. The previous code read a
        # notebook-level global `max_len`, which ignored the constructor
        # argument and failed with NameError when no such global existed.
        waveform = self.pad_or_cut_wave(waveform, self.max_len)

        # Extract fbank features
        fbank = self.model.compute_fbank(waveform)

        sample = {"path": audio_path, "fbank": fbank, "sample_rate": sample_rate}

        return sample

    def pad_or_cut_wave(self, data, max_len):
        """Pad or cut a single wave to the specified length.

        Args:
            data: torch.Tensor whose dimension 1 is the (variable) length
            max_len: length to pad or cut the data to

        Returns:
            torch.Tensor zero-padded at the end of its last dimension, or
            truncated along dimension 1, so that the length equals max_len
        """
        data_len = data.shape[1]
        if data_len < max_len:
            padding = max_len - data_len
            # (0, padding): nothing before, `padding` zeros after the last dim.
            data = torch.nn.functional.pad(data, (0, padding))
        else:
            data = data[:, :max_len]
        return data
    

# Create an instance of the dataset
# 1 s  = 16_000 samples
max_len = 5 * 16000
audio_dataset_fbank = AudioDatasetFBank(df, max_len, campplus_model)

# Create a DataLoader for the audio_dataset
audio_dataloader_fbank = DataLoader(audio_dataset_fbank, batch_size=32, shuffle=False)


In [22]:
campplus_model.device

device(type='mps')

In [6]:
def evaluate_CAMPPLUS(we_speaker_model, dataloader):
    """Run the wrapped network over every batch and collect one embedding per path.

    Args:
        we_speaker_model: Wrapper exposing ``.model`` (a callable network with
            ``eval()``) and ``.device``.
        dataloader: Iterable of batches; each batch provides ``"path"`` and a
            ``"fbank"`` tensor.

    Returns:
        dict mapping each path to the matching row of the network output,
        converted to a NumPy array. When the network returns a tuple, its
        last element is used.
    """
    we_speaker_model.model.eval()
    all_embeddings = {}
    with torch.no_grad():
        for batch in tqdm(dataloader, desc="Evaluating"):
            utts = batch["path"]
            features = batch["fbank"].float().to(we_speaker_model.device)
            outputs = we_speaker_model.model(features)
            if isinstance(outputs, tuple):
                # Tuple outputs carry the embedding in the last position.
                outputs = outputs[-1]
            embeds = outputs.cpu().detach().numpy()
            all_embeddings.update((utt, embeds[i]) for i, utt in enumerate(utts))

    return all_embeddings

embeddings = evaluate_CAMPPLUS(campplus_model, audio_dataloader_fbank)

Evaluating: 100%|██████████| 153/153 [00:27<00:00,  5.59it/s]


In [24]:
def map_embeddings_to_df(df, embeddings):
    """Attach embeddings to ``df`` by joining on the ``path`` column.

    Args:
        df: pandas.DataFrame containing a ``path`` column.
        embeddings: Mapping from path to embedding.

    Returns:
        New DataFrame produced by ``df.merge(..., on="path")`` with an added
        ``embedding`` column. The merge uses pandas' default join, so rows
        without a matching path on both sides are not included.
    """
    path_embedding_pairs = list(embeddings.items())
    embeddings_df = pd.DataFrame(path_embedding_pairs, columns=["path", "embedding"])
    return df.merge(embeddings_df, on="path")

In [None]:
merged_df = map_embeddings_to_df(df, embeddings)


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4874 entries, 0 to 4873
Data columns (total 5 columns):
 #   Column              Non-Null Count  Dtype 
---  ------              --------------  ----- 
 0   path                4874 non-null   object
 1   person_id           4874 non-null   object
 2   utterance_env       4874 non-null   object
 3   utterance_filename  4874 non-null   object
 4   embedding           4874 non-null   object
dtypes: object(5)
memory usage: 190.5+ KB


In [33]:

merged_df.head()


Unnamed: 0,path,person_id,utterance_env,utterance_filename,embedding
0,../data/vox1_test_wav/id10295/nt7dNRvlEHE/0000...,id10295,nt7dNRvlEHE,00005.wav,"[-1.9477693, 0.87528807, 1.1467675, 1.1967306,..."
1,../data/vox1_test_wav/id10295/nt7dNRvlEHE/0000...,id10295,nt7dNRvlEHE,00004.wav,"[-1.3404751, 1.564712, 1.1248378, 0.6840156, -..."
2,../data/vox1_test_wav/id10295/nt7dNRvlEHE/0000...,id10295,nt7dNRvlEHE,00001.wav,"[-1.6733762, 1.2464287, 0.46165344, 0.7650559,..."
3,../data/vox1_test_wav/id10295/nt7dNRvlEHE/0000...,id10295,nt7dNRvlEHE,00003.wav,"[-1.5344026, 0.9411537, 1.0485039, -0.4902958,..."
4,../data/vox1_test_wav/id10295/nt7dNRvlEHE/0000...,id10295,nt7dNRvlEHE,00002.wav,"[-0.84181863, 0.41680804, 0.8774067, 0.0415427..."


In [34]:

merged_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4874 entries, 0 to 4873
Data columns (total 5 columns):
 #   Column              Non-Null Count  Dtype 
---  ------              --------------  ----- 
 0   path                4874 non-null   object
 1   person_id           4874 non-null   object
 2   utterance_env       4874 non-null   object
 3   utterance_filename  4874 non-null   object
 4   embedding           4874 non-null   object
dtypes: object(5)
memory usage: 190.5+ KB


In [35]:
numpy_merged_df = merged_df.copy()
# numpy_merged_df["embedding"] = numpy_merged_df["embedding"].apply(np.array)
numpy_merged_df.to_parquet(
    "speaker_verification_data.parquet",
    index=False,
    engine="pyarrow",
    compression="snappy",
)

In [46]:
df_loaded = pd.read_parquet("../embeds/vox1_test_wav/campplus_embeddings.parquet", engine="pyarrow")

In [48]:
df_loaded.head()

Unnamed: 0,path,person_id,utterance_env,utterance_filename,embedding
0,../data/vox1_test_wav/id10295/nt7dNRvlEHE/0000...,id10295,nt7dNRvlEHE,00005.wav,"[-0.66124964, 0.36657766, -1.1138742, 1.030842..."
1,../data/vox1_test_wav/id10295/nt7dNRvlEHE/0000...,id10295,nt7dNRvlEHE,00004.wav,"[-0.20502278, -0.88471156, -2.3427346, 0.82132..."
2,../data/vox1_test_wav/id10295/nt7dNRvlEHE/0000...,id10295,nt7dNRvlEHE,00001.wav,"[-0.058600664, 0.6727131, -0.7362049, 0.832892..."
3,../data/vox1_test_wav/id10295/nt7dNRvlEHE/0000...,id10295,nt7dNRvlEHE,00003.wav,"[-0.14910118, 0.51563984, -0.8659944, 1.116852..."
4,../data/vox1_test_wav/id10295/nt7dNRvlEHE/0000...,id10295,nt7dNRvlEHE,00002.wav,"[-0.52327955, 0.759426, -0.7779301, 1.2519413,..."


In [38]:
#get  embedding of the first row

df_loaded.iloc[0]["embedding"]

array([-1.9477693 ,  0.87528807,  1.1467675 ,  1.1967306 , -0.47338516,
        1.7572893 ,  0.35084888, -1.1540041 , -1.3210968 ,  1.6012275 ,
       -1.0762585 , -0.44248694,  0.218608  , -0.27130088, -0.45462215,
       -0.19865777, -0.26834065,  0.21367282, -0.8436834 ,  0.11194724,
        0.01127281,  0.86356944, -0.22389542,  1.0766221 , -0.43149847,
       -0.24521439, -1.0943152 ,  0.54799664, -2.165308  ,  0.42285034,
        1.5470072 , -0.39065608, -0.21269657,  0.6641775 ,  1.0433041 ,
       -0.6428348 , -0.24538065, -2.0366335 , -0.6788447 , -1.6342717 ,
       -0.26556277,  0.03650101,  0.1763368 ,  0.8649897 , -0.80470115,
       -1.825708  ,  1.4988852 , -1.1387967 ,  1.0510931 , -0.8013163 ,
        1.6196156 , -1.1638414 ,  1.1226951 , -0.5127667 ,  0.02043838,
       -1.0073698 , -1.9062557 ,  1.3125198 , -0.8931751 , -0.00470269,
       -1.3825536 ,  0.02921643,  0.58605844, -2.123169  , -0.60786897,
        0.7188853 ,  1.1662261 , -0.50129324, -0.14032638,  1.25

In [41]:
embeddings_loaded = np.stack(df_loaded["embedding"].values)
embeddings_tensor_loaded = torch.tensor(embeddings_loaded, dtype=torch.float32)

In [43]:
embeddings_tensor_loaded.shape

torch.Size([4874, 192])

In [27]:
merged_df.to_parquet(
    "speaker_verification_data.parquet", index=False, compression="snappy"
)

ValueError: Can't infer object conversion type: 0       [-1.9477693, 0.87528807, 1.1467675, 1.1967306,...
1       [-1.3404751, 1.564712, 1.1248378, 0.6840156, -...
2       [-1.6733762, 1.2464287, 0.46165344, 0.7650559,...
3       [-1.5344026, 0.9411537, 1.0485039, -0.4902958,...
4       [-0.84181863, 0.41680804, 0.8774067, 0.0415427...
                              ...                        
4869    [0.104850784, 0.7097255, 1.407516, -0.96619225...
4870    [-0.39940205, 0.86317617, 1.2550328, -0.467166...
4871    [-0.15146518, 0.29355344, 1.1567106, -0.776961...
4872    [-0.335996, 0.25943008, 1.2008984, -0.7599641,...
4873    [0.073237926, 0.9998041, 0.6793622, -1.05732, ...
Name: embedding, Length: 4874, dtype: object

In [None]:
#put embeddings into dataframe and save it to ./embeds/
df['embedding'] = df['path'].map(embeddings)

In [None]:
# check if embeddings match
df['embedding'][0] == embeddings[df['path'][0]]

In [None]:
# save embeddings to disk

os.makedirs(embeds_dir, exist_ok=True)
csv_name = "campplus_embeddings.csv"
df.to_csv(os.path.join(embeds_dir, csv_name), index=False)


# ECAPA_TDNN

In [None]:
ecapa_model = wespeaker.load_model_local("./models/voxceleb_ECAPA1024")
ecapa_model.set_device("mps")

In [None]:
test_dir = "./wav_files_voxceleb2"
ecapa_df = scan_directory_voxceleb2(test_dir)

# Create an instance of the dataset
# 1 s  = 16_000 samples
max_len = 5 * 16000
audio_dataset_fbank = AudioDatasetFBank(ecapa_df, max_len, ecapa_model)

# Create a DataLoader for the audio_dataset
audio_dataloader_fbank = DataLoader(audio_dataset_fbank, batch_size=32, shuffle=False)


In [None]:
def evaluate_ECAPA(we_speaker_model, dataloader):
    """Compute one embedding per utterance path with the wrapped network.

    Args:
        we_speaker_model: Wrapper exposing ``.model`` (a callable network with
            ``eval()``) and ``.device``.
        dataloader: Iterable of batches; each batch provides ``"path"`` and a
            ``"fbank"`` tensor.

    Returns:
        dict mapping each path to its embedding as a NumPy array. For tuple
        outputs the last tuple element is taken as the embedding.
    """
    all_embeddings = {}
    we_speaker_model.model.eval()
    with torch.no_grad():
        progress = tqdm(dataloader, desc="Evaluating")
        for batch in progress:
            utts = batch["path"]
            inputs = batch["fbank"].float()
            inputs = inputs.to(we_speaker_model.device)
            result = we_speaker_model.model(inputs)
            embeds = result if not isinstance(result, tuple) else result[-1]
            embeds = embeds.cpu().detach().numpy()

            for row, utt in enumerate(utts):
                all_embeddings[utt] = embeds[row]

    return all_embeddings


embeddings = evaluate_ECAPA(ecapa_model, audio_dataloader_fbank)

In [None]:
ecapa_df['embedding'] = ecapa_df['path'].map(embeddings)

# check if embeddings match
ecapa_df['embedding'][0] == embeddings[ecapa_df['path'][0]]

# save embeddings to disk
csv_name = "ecapa_embeddings.csv"
ecapa_df.to_csv(os.path.join(embeds_dir, csv_name), index=False)


# ResNet34

In [None]:
resnet34_model = wespeaker.load_model_local("./models/cnceleb_resnet34")
resnet34_model.set_device("mps")

In [None]:
max_len = 5 * 16000
audio_dataset_fbank = AudioDatasetFBank(df, max_len, resnet34_model)
audio_dataloader_fbank = DataLoader(audio_dataset_fbank, batch_size=32, shuffle=False)

In [None]:
def evaluate_RESNET34(we_speaker_model, dataloader):
    """Compute one embedding per utterance path with the wrapped network.

    Args:
        we_speaker_model: Wrapper exposing ``.model`` (a callable network with
            ``eval()``) and ``.device``.
        dataloader: Iterable of batches; each batch provides ``"path"`` and a
            ``"fbank"`` tensor.

    Returns:
        dict mapping each path to its embedding as a NumPy array. For tuple
        outputs the last tuple element is taken as the embedding.
    """
    we_speaker_model.model.eval()
    all_embeddings = {}

    with torch.no_grad():
        for batch in tqdm(dataloader, desc="Evaluating"):
            utts = batch["path"]
            features = batch["fbank"].float().to(we_speaker_model.device)

            outputs = we_speaker_model.model(features)
            if isinstance(outputs, tuple):
                embeds = outputs[-1]
            else:
                embeds = outputs
            embeds = embeds.cpu().detach().numpy()

            for i, utt in enumerate(utts):
                all_embeddings[utt] = embeds[i]

    return all_embeddings


embeddings = evaluate_RESNET34(resnet34_model, audio_dataloader_fbank)

In [None]:
# print how many parameters in the model
print(f"Number of parameters in the model: {sum(p.numel() for p in resnet34_model.model.parameters())}")

# REDIMNET

In [None]:
from huggingface_hub import hf_hub_download

# Settings
repo_id = "Jenthe/ECAPA2"
filename = "ecapa2.pt"
cache_dir = "../models/ReDimNet"  # Specify the location
# NOTE(review): this downloads the ECAPA2 checkpoint into a directory named
# after ReDimNet — confirm that this cache location is intended.

# Download the model
model_file = hf_hub_download(repo_id=repo_id, filename=filename, cache_dir=cache_dir)

In [None]:
import torch
import os
# model_name = "b2"  # ~b3-b4 size
# train_type = "ptn"
# dataset = "vox2"

# redim = torch.hub.load(
#     "IDRnD/ReDimNet",
#     "ReDimNet",
#     model_name=model_name,
#     train_type=train_type,
#     dataset=dataset,
#     source="github",
# )

# cache_dir = "../models/ReDimNet"
# os.makedirs(cache_dir, exist_ok=True)

# torch.save(redim.state_dict(), os.path.join(cache_dir, "redim_model.pt"))

In [None]:
import sys

sys.path.append("../helper_libs")
from redimnet.model import ReDimNetWrap

path = "../models/ReDimNet/b6-vox2-ptn.pt"
# Deserialize onto the CPU first. Without `map_location`, torch.load restores
# tensors to the device they were saved from, which fails on a machine that
# lacks that device; the model is moved to the target device explicitly below.
full_state_dict = torch.load(path, map_location="cpu")
# The checkpoint bundles the constructor arguments with the weights.
model_config = full_state_dict["model_config"]
state_dict = full_state_dict["state_dict"]

# Create an instance of the model using the configuration
redimnet_model = ReDimNetWrap(**model_config)

# Load the state dictionary into the model
redimnet_model.load_state_dict(state_dict)

# Move the model to the desired device (e.g., 'mps' or 'cpu')
redimnet_model.to("mps")

In [None]:


# Extract the model configuration and state dictionary from full_state_dict
model_config = full_state_dict['model_config']
state_dict = full_state_dict['state_dict']

# Create an instance of the model using the configuration
model = ReDimNetWrap(**model_config)

# Load the state dictionary into the model
model.load_state_dict(state_dict)

# Move the model to the desired device (e.g., 'mps' or 'cpu')
model.to('mps')

# Verify the model is loaded correctly
print(model)

In [None]:
import torch
path = "../models/ReDimNet/redim_model.pt"
# Load the model from the local file
model = torch.hub.load(
	"IDRnD/ReDimNet",
	"ReDimNet",
	model_name="b2",  # ~b3-b4 size
	train_type="ptn",
	dataset="vox2",
	source="github",
)
model.load_state_dict(torch.load(path))

In [None]:
type(redim)

In [None]:
redim.to('mps')

In [None]:
device = next(redim.parameters()).device
print(f"The model is loaded on: {device}")

In [None]:
def extract_redim_embeddings(model, dataloader):
    """Run ``model`` over every batch and collect ``(path, embedding)`` pairs.

    Args:
        model: Network with ``eval()`` and ``forward(waveforms)``. If it has a
            ``device`` attribute that is used; otherwise the device of its
            first parameter is used (CPU when it has no parameters).
        dataloader: Iterable of batches; each batch provides ``'path'`` and a
            ``'waveform'`` tensor.

    Returns:
        list of ``(path, embedding)`` tuples in iteration order, where each
        embedding is one row of the model output as a NumPy array.
    """
    model.eval()

    # A plain torch.nn.Module does not define `.device`, so reading
    # `model.device` unconditionally raised AttributeError for such models.
    device = getattr(model, "device", None)
    if device is None:
        first_param = next(iter(model.parameters()), None)
        device = first_param.device if first_param is not None else torch.device("cpu")

    embeddings = []
    for batch in tqdm(dataloader, desc="Extracting ReDimNet Embeddings"):
        paths = batch['path']
        waveforms = batch['waveform'].float().to(device)
        with torch.no_grad():
            batch_embeddings = model.forward(waveforms).cpu().numpy()
        for path, embedding in zip(paths, batch_embeddings):
            embeddings.append((path, embedding))
    return embeddings

# Example usage
redim_embeddings = extract_redim_embeddings(redim, audio_dataloader)
df['redim_embedding'] = [embedding for _, embedding in redim_embeddings]
df

In [None]:
import torchaudio

def process_audio_and_extract_embeddings(model, df):
    """Embed every audio file listed in ``df`` one at a time.

    Args:
        model: Network with ``eval()`` and ``forward(waveform)``.
        df: pandas.DataFrame with a ``path`` column of audio file paths.

    Returns:
        The same ``df`` object, modified in place with a new ``embedding``
        column holding one NumPy array per row.
    """
    embeddings = []
    model.eval()

    # torchaudio.load returns the waveform on the CPU, while this notebook
    # moves the model to "mps"; feed the model inputs on its own device.
    parameters = getattr(model, "parameters", None)
    first_param = next(iter(parameters()), None) if callable(parameters) else None
    device = first_param.device if first_param is not None else None

    for index, row in tqdm(df.iterrows(), total=len(df), desc="Processing Audio"):
        audio_path = row['path']
        waveform, sample_rate = torchaudio.load(audio_path)
        if device is not None:
            waveform = waveform.to(device)

        with torch.no_grad():
            embedding = model.forward(waveform).cpu().numpy()

        embeddings.append(embedding)

    df['embedding'] = embeddings
    return df

# Example usage
df = process_audio_and_extract_embeddings(redim, df)
df

In [None]:
df.to_csv('redim_subsample.csv', index=False)

In [None]:
# 4 seconds of audio at 16 kHz.
max_len = 4 * 16_000
audio_dataset = AudioDataset(df, max_len)
audio_dataloader = DataLoader(audio_dataset, batch_size=32, shuffle=False)

In [None]:
def evaluate_REDIMNET(model, dataloader, device="mps"):
    """Run ``model`` over waveform batches and collect one embedding per path.

    Args:
        model: Network with ``eval()`` and ``forward(features)``.
        dataloader: Iterable of batches; each batch provides ``"path"`` and a
            ``"waveform"`` tensor.
        device: Device the input batches are moved to. Defaults to ``"mps"``,
            the value that used to be hard-coded, so existing calls behave as
            before; pass ``"cpu"`` or ``"cuda"`` to run elsewhere.

    Returns:
        dict mapping each path to the matching row of the model output as a
        NumPy array.
    """
    all_embeddings = {}
    model.eval()
    with torch.no_grad():
        for batch in tqdm(dataloader, desc="Evaluating REDIMNET"):
            utts = batch["path"]
            features = batch["waveform"].float().to(device)
            # Forward through model
            embeds = model.forward(features).cpu().numpy()

            for i, utt in enumerate(utts):
                embed = embeds[i]
                all_embeddings[utt] = embed

    return all_embeddings

redim.to('mps')
embeddings = evaluate_REDIMNET(redim, audio_dataloader)

# ECAPA2

In [None]:
from huggingface_hub import hf_hub_download

# automatically checks for cached file, optionally set `cache_dir` location
model_file = hf_hub_download(repo_id='Jenthe/ECAPA2', filename='ecapa2.pt', cache_dir="../models/ECAPA2")


In [None]:
ecapa2 = torch.jit.load(model_file, map_location="cpu")

In [None]:
device = next(ecapa2.parameters()).device
print(f"The ECAPA2 model is loaded on: {device}")

In [None]:
ecapa2.to('mps')

In [None]:
test_dir = "./wav_files_voxceleb2"
ecapa2_df = scan_directory_voxceleb2(test_dir)

In [None]:
# 5 seconds of audio at 16 kHz.
max_len = 5 * 16_000
audio_dataset = AudioDataset(ecapa2_df, max_len)
audio_dataloader = DataLoader(audio_dataset, batch_size=32, shuffle=False)

In [None]:
def evaluate_ECAPA2(model, dataloader, device="mps"):
    """Run ``model`` over waveform batches and collect one embedding per path.

    Args:
        model: Network with ``eval()`` and ``forward(features)``.
        dataloader: Iterable of batches; each batch provides ``"path"`` and a
            ``"waveform"`` tensor.
        device: Device the input batches are moved to. Defaults to ``"mps"``,
            the value that used to be hard-coded, so existing calls behave as
            before; pass ``"cpu"`` or ``"cuda"`` to run elsewhere.

    Returns:
        dict mapping each path to the matching row of the model output as a
        NumPy array.
    """
    all_embeddings = {}
    model.eval()
    with torch.no_grad():
        for batch in tqdm(dataloader, desc="Evaluating ECAPA2"):
            utts = batch["path"]
            features = batch["waveform"].float().to(device)
            # Forward through model
            embeds = model.forward(features).cpu().numpy()

            for i, utt in enumerate(utts):
                embed = embeds[i]
                all_embeddings[utt] = embed

    return all_embeddings


embeddings_ecapa2 = evaluate_ECAPA2(ecapa2, audio_dataloader)

In [None]:
ecapa2_df['embedding'] = ecapa2_df['path'].map(embeddings_ecapa2)

# check if embeddings match
ecapa2_df['embedding'][0] == embeddings_ecapa2[ecapa2_df['path'][0]]

# save embeddings to disk
csv_name = "ecapa2_embeddings.csv"
ecapa2_df.to_csv(os.path.join(embeds_dir, csv_name), index=False)


In [None]:
from torch.utils.data import Dataset


class VariableLengthDataset(Dataset):
    """Thin ``Dataset`` wrapper around an indexable collection of samples.

    Items are returned exactly as stored, so samples of different lengths
    need a custom ``collate_fn`` when batched.
    """

    def __init__(self, data):
        # Keep a reference to the caller's collection; nothing is copied.
        self.data = data

    def __len__(self):
        """Return the number of stored samples."""
        sample_count = len(self.data)
        return sample_count

    def __getitem__(self, idx):
        """Return the sample stored at ``idx``."""
        sample = self.data[idx]
        return sample

In [None]:
def collate_fn(batch):
    """Collate samples into per-key lists without stacking tensors.

    Args:
        batch: Sequence of sample dicts, each with ``"path"`` and
            ``"waveform"`` entries.

    Returns:
        dict with the keys ``"path"`` and ``"waveform"``, each holding a list
        of the corresponding values in batch order.
    """
    collated = {}
    for key in ("path", "waveform"):
        collated[key] = [item[key] for item in batch]
    return collated

In [None]:
from torch.utils.data import DataLoader
import torch
# Example data
data = [
    {'path': 'utt1', 'waveform': torch.randn(16000)},
    {'path': 'utt2', 'waveform': torch.randn(32000)},
    {'path': 'utt3', 'waveform': torch.randn(56000)},
]

dataset = VariableLengthDataset(data)
dataloader = DataLoader(dataset, batch_size=2, collate_fn=collate_fn)

In [None]:
import torch
from tqdm import tqdm


def evaluate_torch_model_various(model, dataloader, device):
    """Embed variable-length waveforms one utterance at a time.

    Args:
        model: Network with ``eval()`` and ``forward(waveform)``.
        dataloader: Iterable of batches; each batch provides parallel
            ``"path"`` and ``"waveform"`` sequences.
        device: Device each waveform is moved to before the forward pass.

    Returns:
        dict mapping each path to the model output for that waveform as a
        NumPy array with the leading batch axis removed.
    """
    model.eval()
    all_embeddings = {}
    with torch.no_grad():
        for batch in tqdm(dataloader, desc="Evaluating"):
            for utt, waveform in zip(batch["path"], batch["waveform"]):
                # Add a leading batch dimension of size 1.
                single = waveform.float().to(device).unsqueeze(0)
                output = model.forward(single).cpu().numpy()
                # Remove the batch dimension again.
                all_embeddings[utt] = output.squeeze(0)

    return all_embeddings

In [None]:
evaluated_embeddings = evaluate_torch_model_various(redimnet_model, dataloader, "mps")

In [None]:
class AudioDatasetVarious(Dataset):
    """Dataset that returns each audio file at its original length.

    No padding or cutting is applied, so batching items of different lengths
    requires a custom ``collate_fn``.

    Args:
        dataframe: pandas.DataFrame with a ``path`` column of audio file paths.
    """

    def __init__(self, dataframe):
        self.dataframe = dataframe

    def __len__(self):
        """Return the number of rows in the backing DataFrame."""
        return len(self.dataframe)

    def __getitem__(self, idx):
        """Load the file at positional row ``idx`` and return it as a sample dict."""
        position = idx.tolist() if torch.is_tensor(idx) else idx

        audio_path = self.dataframe.iloc[position]["path"]
        waveform, sample_rate = torchaudio.load(audio_path)

        return {
            "path": audio_path,
            "waveform": waveform,
            "sample_rate": sample_rate,
        }

In [None]:
def collate_fn(batch):
    """Group sample paths and waveforms into plain lists (no tensor stacking).

    Args:
        batch: Sequence of sample dicts, each with ``"path"`` and
            ``"waveform"`` entries.

    Returns:
        dict with ``"path"`` and ``"waveform"`` lists in batch order; other
        keys of the samples are dropped.
    """
    return {
        "path": [sample["path"] for sample in batch],
        "waveform": [sample["waveform"] for sample in batch],
    }

In [None]:
dataset = AudioDatasetVarious(df)
dataloader = DataLoader(dataset, batch_size=32, collate_fn=collate_fn)

In [None]:
evaluated_embeddings = evaluate_torch_model_various(redimnet_model, dataloader, "mps")

Windowed dataset to parquet format

In [2]:

os.chdir("..")


In [15]:
from torch.utils.data import Dataset

In [16]:
class AudioDatasetFBank(Dataset):
    """Dataset that loads audio files and converts them to fbank features.

    Unlike a pad-or-cut dataset, the waveform is passed to the feature
    extractor at its original length; ``max_len`` is stored but not applied
    by this class. Note that the features are returned under the key
    ``"waveform"``.

    Args:
        dataframe: pandas.DataFrame with a ``path`` column of audio file paths.
        max_len: Stored on the instance as ``self.max_len``.
        model: Object exposing ``compute_fbank(waveform)``.
    """

    def __init__(self, dataframe, max_len, model):
        self.model = model
        self.max_len = max_len
        self.dataframe = dataframe

    def __len__(self):
        """Return the number of rows in the backing DataFrame."""
        return len(self.dataframe)

    def __getitem__(self, idx):
        """Load the file at positional row ``idx`` and return its feature sample dict."""
        position = idx.tolist() if torch.is_tensor(idx) else idx

        audio_path = self.dataframe.iloc[position]["path"]
        waveform, sample_rate = torchaudio.load(audio_path)

        # The fbank features travel under the "waveform" key.
        return {
            "path": audio_path,
            "waveform": self.model.compute_fbank(waveform),
            "sample_rate": sample_rate,
        }


# Create an instance of the dataset
# 1 s  = 16_000 samples


In [17]:
def evaluate_CAMPPLUS(we_speaker_model, dataloader):
    """Run the wrapped network over every batch and collect one embedding per path.

    Args:
        we_speaker_model: Wrapper exposing ``.model`` (a callable network with
            ``eval()``) and ``.device``.
        dataloader: Iterable of batches; each batch provides ``"path"`` and
            the network input tensor under ``"waveform"``.

    Returns:
        dict mapping each path to the matching row of the network output,
        converted to a NumPy array. When the network returns a tuple, its
        last element is used.
    """
    we_speaker_model.model.eval()
    all_embeddings = {}
    with torch.no_grad():
        for batch in tqdm(dataloader, desc="Evaluating"):
            utts = batch["path"]
            features = batch["waveform"].float().to(we_speaker_model.device)
            outputs = we_speaker_model.model(features)
            if isinstance(outputs, tuple):
                # Tuple outputs carry the embedding in the last position.
                outputs = outputs[-1]
            embeds = outputs.cpu().detach().numpy()
            all_embeddings.update((utt, embeds[i]) for i, utt in enumerate(utts))

    return all_embeddings

In [21]:
test_dir = "data/voxceleb2_eval_segments"
df = scan_directory_voxceleb2(test_dir)


campplus_model = wespeaker.load_model("campplus")
campplus_model.set_device("mps")

max_len = 4 * 16000
audio_dataset_fbank = AudioDatasetFBank(df, max_len, campplus_model)

# Create a DataLoader for the audio_dataset
audio_dataloader_fbank = DataLoader(audio_dataset_fbank, batch_size=32, shuffle=False)

embeddings = evaluate_CAMPPLUS(campplus_model, audio_dataloader_fbank)

Evaluating: 100%|██████████| 347/347 [00:53<00:00,  6.49it/s]


In [None]:
# Split utterance_filename on "_": the first piece is the video_id and the third
# piece (with ".wav" stripped) is the frame_id, e.g. "87_seg_14.wav" -> "87", "14".
# Both new columns hold strings; utterance_filename is then dropped in place.
df['video_id'] = df['utterance_filename'].apply(lambda x: x.split("_")[0])
df['frame_id'] = df['utterance_filename'].apply(lambda x: x.split("_")[2].replace(".wav", ""))
df.drop(columns=['utterance_filename'], inplace=True)

In [30]:
df

Unnamed: 0,path,person_id,video_id,frame_id
0,data/voxceleb2_eval_segments/id02019/87_seg_14...,id02019,87,14
1,data/voxceleb2_eval_segments/id02019/115_seg_1...,id02019,115,1
2,data/voxceleb2_eval_segments/id02019/96_seg_0.wav,id02019,96,0
3,data/voxceleb2_eval_segments/id02019/89_seg_4.wav,id02019,89,4
4,data/voxceleb2_eval_segments/id02019/1_seg_13.wav,id02019,1,13
...,...,...,...,...
11091,data/voxceleb2_eval_segments/id03347/313_seg_1...,id03347,313,16
11092,data/voxceleb2_eval_segments/id03347/34_seg_2.wav,id03347,34,2
11093,data/voxceleb2_eval_segments/id03347/381_seg_3...,id03347,381,3
11094,data/voxceleb2_eval_segments/id03347/259_seg_2...,id03347,259,22


In [31]:

from src.utils import map_embeddings_to_df

df = map_embeddings_to_df(df, embeddings)

In [33]:
df.drop(columns=['path'], inplace=True)

In [34]:
df

Unnamed: 0,person_id,video_id,frame_id,embedding
0,id02019,87,14,"[-0.7572477, -0.28664908, 1.2824339, -0.092000..."
1,id02019,115,1,"[0.8128398, 1.931974, 1.5611752, 0.6306397, -0..."
2,id02019,96,0,"[0.3887814, 0.728199, 0.86692244, -0.42631707,..."
3,id02019,89,4,"[-0.75237453, -0.14510478, 0.0634555, -0.42303..."
4,id02019,1,13,"[-1.2547182, -0.08362903, 0.39995107, -0.02934..."
...,...,...,...,...
11091,id03347,313,16,"[-0.40491396, 0.82559365, -0.06662895, -0.9083..."
11092,id03347,34,2,"[-0.5628453, -1.0667058, -0.295191, 1.0603111,..."
11093,id03347,381,3,"[-0.12918745, -0.61040545, -0.3202267, -0.2740..."
11094,id03347,259,22,"[-0.38162476, 0.37319005, 0.9816289, -0.403025..."


In [23]:
from src.utils import save_embeddings_to_parquet

os.makedirs("embeds/voxceleb2_segments", exist_ok=True)
save_embeddings_to_parquet(df, "embeds/voxceleb2_segments/campplus_embeds")

In [6]:
from src.utils import read_embeddings_from_parquet


audio_df = read_embeddings_from_parquet("embeds/voxceleb2_segments/campplus_embeds")
video_df = read_embeddings_from_parquet("embeds/facenet_eval")

In [27]:
video_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8941 entries, 0 to 8940
Data columns (total 4 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   video_id   8941 non-null   object 
 1   person_id  8941 non-null   object 
 2   embedding  8449 non-null   object 
 3   frame_id   8449 non-null   float64
dtypes: float64(1), object(3)
memory usage: 279.5+ KB


In [11]:


audio_df

Unnamed: 0,path,person_id,utterance_filename,embedding
0,../data/voxceleb2_eval_segments/id02019/50_seg...,id02019,50_seg3_.wav,"[0.4952665, 0.22981328, 0.81912225, 0.6312957,..."
1,../data/voxceleb2_eval_segments/id02019/194_se...,id02019,194_seg0_.wav,"[0.061572704, 0.14836346, 1.0915542, -0.358542..."
2,../data/voxceleb2_eval_segments/id02019/196_se...,id02019,196_seg8_.wav,"[0.14852108, -0.22755864, 0.08376154, 0.225873..."
3,../data/voxceleb2_eval_segments/id02019/87_seg...,id02019,87_seg8_.wav,"[-0.010013767, 0.37227827, 0.76968944, -0.4603..."
4,../data/voxceleb2_eval_segments/id02019/1_seg9...,id02019,1_seg9_.wav,"[-1.1399962, 1.3456193, -0.090061635, 0.067406..."
...,...,...,...,...
11091,../data/voxceleb2_eval_segments/id03347/268_se...,id03347,268_seg8_.wav,"[0.106771335, -0.60677135, 0.6916544, 0.571006..."
11092,../data/voxceleb2_eval_segments/id03347/323_se...,id03347,323_seg4_.wav,"[-1.2123433, 1.0951328, 0.07456491, -0.0196330..."
11093,../data/voxceleb2_eval_segments/id03347/313_se...,id03347,313_seg17_.wav,"[-0.23038934, 0.4561194, 0.17479698, -0.952375..."
11094,../data/voxceleb2_eval_segments/id03347/259_se...,id03347,259_seg15_.wav,"[0.0460768, -0.34532627, 0.29578942, -0.645461..."
