In [None]:
!pip install transformers torchaudio librosa soundfile xgboost

Collecting xgboost==1.7.6
  Downloading xgboost-1.7.6-py3-none-manylinux2014_x86_64.whl.metadata (1.9 kB)
Downloading xgboost-1.7.6-py3-none-manylinux2014_x86_64.whl (200.3 MB)
   ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 200.3/200.3 MB 7.0 MB/s eta 0:00:00
Installing collected packages: xgboost
  Attempting uninstall: xgboost
    Found existing installation: xgboost 3.1.2
    Uninstalling xgboost-3.1.2:
      Successfully uninstalled xgboost-3.1.2
Successfully installed xgboost-1.7.6


In [None]:
import os
import librosa
import numpy as np
import torch
from transformers import Wav2Vec2FeatureExtractor, Wav2Vec2Model
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, classification_report
import pickle
from tqdm import tqdm

In [None]:
# Prefer the GPU when PyTorch can see one; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
print("Using device:", device)

# The feature extractor and the encoder are loaded from the same pretrained
# checkpoint; the encoder is moved onto the selected device right away.
processor = Wav2Vec2FeatureExtractor.from_pretrained("facebook/wav2vec2-xls-r-300m")
model = Wav2Vec2Model.from_pretrained("facebook/wav2vec2-xls-r-300m").to(device)

# Leaving the model as the cell's last expression displays its architecture.
model

Using device: cuda


pytorch_model.bin:   0%|          | 0.00/1.27G [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.27G [00:00<?, ?B/s]

Wav2Vec2Model(
  (feature_extractor): Wav2Vec2FeatureEncoder(
    (conv_layers): ModuleList(
      (0): Wav2Vec2LayerNormConvLayer(
        (conv): Conv1d(1, 512, kernel_size=(10,), stride=(5,))
        (layer_norm): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
        (activation): GELUActivation()
      )
      (1-4): 4 x Wav2Vec2LayerNormConvLayer(
        (conv): Conv1d(512, 512, kernel_size=(3,), stride=(2,))
        (layer_norm): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
        (activation): GELUActivation()
      )
      (5-6): 2 x Wav2Vec2LayerNormConvLayer(
        (conv): Conv1d(512, 512, kernel_size=(2,), stride=(2,))
        (layer_norm): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
        (activation): GELUActivation()
      )
    )
  )
  (feature_projection): Wav2Vec2FeatureProjection(
    (layer_norm): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
    (projection): Linear(in_features=512, out_features=1024, bias=True)
    (dropout)

In [None]:
def extract_embedding(audio_path, target_sr=16000):
    """Return a fixed-size embedding for a single audio file.

    The file is loaded with librosa (resampled to ``target_sr``), passed
    through the module-level ``processor`` and ``model``, and the model's
    ``last_hidden_state`` is averaged over dimension 1 to give one vector
    per file.

    Args:
        audio_path: Path of the audio file to embed.
        target_sr: Sampling rate the audio is resampled to; the same value
            is passed to the processor as ``sampling_rate``.

    Returns:
        A 1-D NumPy array holding the averaged hidden state.

    Raises:
        ValueError: If the file decodes to zero samples.
    """
    # librosa hands back a NumPy waveform, which is an input type the
    # feature extractor documents, so no torch round-trip is needed here.
    waveform, _ = librosa.load(audio_path, sr=target_sr)
    if waveform.size == 0:
        # Fail with the offending path instead of an opaque error from
        # deep inside the model.
        raise ValueError(f"No audio samples could be read from {audio_path!r}")

    inputs = processor(waveform, sampling_rate=target_sr, return_tensors="pt", padding=True)
    # Move every tensor the processor produced onto the model's device.
    inputs = {name: tensor.to(device) for name, tensor in inputs.items()}

    # Inference only: skip gradient tracking.
    with torch.no_grad():
        hidden_states = model(**inputs).last_hidden_state

    # Average over dim=1, bring the result back to host memory and flatten
    # the remaining batch dimension away.
    return hidden_states.mean(dim=1).cpu().numpy().flatten()

In [None]:
def load_split(split_path, cache_embeddings=True, cache_file=None):
    """Return the embedding matrix and labels for one dataset split.

    ``split_path`` is expected to contain a ``real`` and a ``fake``
    sub-folder. Every ``.wav`` file inside them is embedded with
    ``extract_embedding``; files under ``real`` are labelled 0 and files
    under ``fake`` are labelled 1.

    When ``cache_embeddings`` is true and ``cache_file`` is given, the result
    is read from that ``.npz`` file if it already exists and written to it
    otherwise.

    Args:
        split_path: Directory holding the ``real`` and ``fake`` sub-folders.
        cache_embeddings: Whether to read/write the embedding cache.
        cache_file: Path of the ``.npz`` cache (str or path-like). A missing
            ``.npz`` suffix is added, matching what ``np.savez_compressed``
            does when it writes the file.

    Returns:
        Tuple ``(X, y)`` of NumPy arrays: the stacked embeddings and the
        integer labels, in matching order.
    """
    # np.savez_compressed appends ".npz" to names that lack it. Normalise the
    # path once so the existence check, the load and the save all refer to
    # the same file; otherwise an extension-less cache name is never found.
    cache_path = None
    if cache_embeddings and cache_file:
        cache_path = os.fspath(cache_file)
        if not cache_path.endswith(".npz"):
            cache_path += ".npz"

    if cache_path and os.path.exists(cache_path):
        print(f"Loading cached embeddings from {cache_path}...")
        # The cache holds plain numeric arrays, so pickle support is not
        # needed; the context manager closes the underlying archive.
        with np.load(cache_path) as data:
            return data['X'], data['y']

    X, y = [], []
    for label, subfolder in enumerate(["real", "fake"]):
        folder = os.path.join(split_path, subfolder)
        print(f"Loading {subfolder} voices from: {folder}")
        # os.listdir order is arbitrary; sorting keeps the row order (and
        # therefore the cached arrays) reproducible between runs.
        for file in tqdm(sorted(os.listdir(folder))):
            if file.endswith(".wav"):
                X.append(extract_embedding(os.path.join(folder, file)))
                y.append(label)

    X = np.array(X)
    y = np.array(y)

    if cache_path:
        np.savez_compressed(cache_path, X=X, y=y)
        print(f"Saved embeddings to {cache_path}")

    return X, y