### Loader to Load and Test All Models

#### Setup & Wav2Vec2 Embeddings

In [40]:
import torch
import numpy as np
import soundfile as sf
import librosa
from transformers import Wav2Vec2FeatureExtractor, Wav2Vec2Model

# Run on the GPU when CUDA is available; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

# Load the Wav2Vec2 feature extractor and model from the same checkpoint.
feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained("facebook/wav2vec2-xls-r-300m")

wav2vec_model = Wav2Vec2Model.from_pretrained("facebook/wav2vec2-xls-r-300m")
wav2vec_model = wav2vec_model.to(device)
# Switch the model to evaluation mode before it is used for embedding extraction.
wav2vec_model.eval()

def extract_embedding(file_path):
    """Return a mean-pooled Wav2Vec2 embedding for a single audio file.

    The audio is read with ``soundfile``, averaged over axis 1 when it has
    more than one dimension (presumably the channel axis), resampled to
    16 kHz if its sample rate differs, and then passed through the
    module-level ``feature_extractor`` and ``wav2vec_model``.

    Args:
        file_path: Audio file to embed; forwarded unchanged to ``sf.read``.

    Returns:
        A NumPy array holding the model's ``last_hidden_state`` averaged
        over dimension 1, copied to the CPU.
    """
    target_sr = 16000

    waveform, sample_rate = sf.read(file_path)

    # Collapse multi-dimensional audio to a single 1-D signal.
    if waveform.ndim > 1:
        waveform = waveform.mean(axis=1)

    # Force 16kHz
    if sample_rate != target_sr:
        waveform = librosa.resample(
            waveform, orig_sr=sample_rate, target_sr=target_sr
        )
        sample_rate = target_sr

    encoded = feature_extractor(
        waveform,
        sampling_rate=sample_rate,
        return_tensors="pt",
        padding=True,
    )

    # Move every tensor produced by the feature extractor onto the model's device.
    model_inputs = {}
    for key, tensor in encoded.items():
        model_inputs[key] = tensor.to(device)

    # Inference only: no gradients are needed for the forward pass.
    with torch.no_grad():
        hidden_states = wav2vec_model(**model_inputs).last_hidden_state

    pooled = hidden_states.mean(dim=1)
    return pooled.cpu().numpy()