### Load and test all models

#### Setup & Wav2Vec2 Embeddings

In [40]:
import torch
import numpy as np
import soundfile as sf
import librosa
from transformers import Wav2Vec2FeatureExtractor, Wav2Vec2Model

# Run on the GPU when PyTorch can see one; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

# Load Wav2Vec2 (XLS-R 300M checkpoint). The feature extractor is applied to
# the waveform first; the model then yields the hidden states that
# extract_embedding() pools into an embedding.
feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained("facebook/wav2vec2-xls-r-300m")
wav2vec_model = Wav2Vec2Model.from_pretrained("facebook/wav2vec2-xls-r-300m")
wav2vec_model = wav2vec_model.to(device)
wav2vec_model.eval()  # inference only: switch the module to evaluation mode

def extract_embedding(file_path):
    """Return a mean-pooled Wav2Vec2 embedding for one audio file.

    The audio is read with ``soundfile``, averaged over axis 1 when it has
    more than one dimension, resampled to 16 kHz when its sample rate
    differs, and passed through the module-level ``feature_extractor`` and
    ``wav2vec_model`` on ``device``.

    Args:
        file_path: Location of the audio file, handed to ``sf.read``.

    Returns:
        NumPy array containing ``last_hidden_state`` averaged over dim 1
        (presumably shape ``(1, hidden_size)`` for a single clip -- confirm
        against the checkpoint in use).
    """
    target_sr = 16000

    waveform, sample_rate = sf.read(file_path)

    # Multi-dimensional reads are collapsed to one signal by averaging axis 1
    # (the channel axis in soundfile's (frames, channels) layout).
    if waveform.ndim > 1:
        waveform = waveform.mean(axis=1)

    # Force 16kHz
    if sample_rate != target_sr:
        waveform = librosa.resample(
            waveform, orig_sr=sample_rate, target_sr=target_sr
        )
        sample_rate = target_sr

    encoded = feature_extractor(
        waveform, sampling_rate=sample_rate, return_tensors="pt", padding=True
    )
    # Move every tensor produced by the extractor onto the model's device.
    model_inputs = {name: tensor.to(device) for name, tensor in encoded.items()}

    # No gradients are needed for feature extraction.
    with torch.no_grad():
        model_out = wav2vec_model(**model_inputs)

    pooled = model_out.last_hidden_state.mean(dim=1)
    return pooled.cpu().numpy()

#### Neural Network Inference

In [39]:
import torch.nn as nn

class VoiceNN(nn.Module):
    """Small feed-forward network that outputs a single sigmoid score.

    Layer stack: Linear(input_size, 512) -> ReLU -> Linear(512, 128) -> ReLU
    -> Linear(128, 1) -> Sigmoid.

    Args:
        input_size: Number of features in each input row.
    """

    def __init__(self, input_size):
        super().__init__()
        # The attribute name ``model`` and the layer order determine the
        # state-dict keys ("model.0.weight", ...), so saved checkpoints only
        # load if both stay as they are.
        layers = [
            nn.Linear(input_size, 512),
            nn.ReLU(),
            nn.Linear(512, 128),
            nn.ReLU(),
            nn.Linear(128, 1),
            nn.Sigmoid(),
        ]
        self.model = nn.Sequential(*layers)

    def forward(self, x):
        """Apply the layer stack to ``x`` and return the sigmoid output."""
        return self.model(x)


def load_nn_model(path, input_size):
    """Build a ``VoiceNN`` and load trained weights into it.

    Args:
        path: Checkpoint file holding a state dict for ``VoiceNN``.
        input_size: Number of input features the network expects.

    Returns:
        The ``VoiceNN`` instance with weights loaded onto the CPU and
        switched to evaluation mode.
    """
    model = VoiceNN(input_size)
    # weights_only=True restricts unpickling to tensors and plain containers,
    # so a tampered checkpoint cannot run arbitrary code while being loaded.
    state_dict = torch.load(path, map_location="cpu", weights_only=True)
    model.load_state_dict(state_dict)
    model.eval()
    return model

def predict_nn(wav_file, model_path):
    """Score one audio file with the neural-network classifier.

    The embedding comes from ``extract_embedding``; its second dimension is
    used as the network's input size when the weights are loaded with
    ``load_nn_model``.

    Args:
        wav_file: Path of the audio file to score.
        model_path: Path of the saved network weights.

    Returns:
        The network's single output value as a Python float.
    """
    emb = extract_embedding(wav_file)
    model = load_nn_model(model_path, emb.shape[1])
    features = torch.tensor(emb, dtype=torch.float32)
    # Inference only: skip autograd bookkeeping, as extract_embedding does
    # for the Wav2Vec2 forward pass.
    with torch.no_grad():
        prob = model(features).item()
    return prob


#### XGBoost & LightGBM Inference

In [38]:
import pickle
import lightgbm as lgb

def predict_xgb(wav_file, model_path):
    """Predict for one audio file with the pickled model at ``model_path``.

    Args:
        wav_file: Path of the audio file to classify.
        model_path: Path of a pickle file containing an object with a
            ``predict`` method.

    Returns:
        The first element of ``predict`` applied to the file's embedding.

    NOTE(review): ``pickle.load`` can execute arbitrary code embedded in the
    file, so only point this at model files from a trusted source.
    """
    features = extract_embedding(wav_file)

    with open(model_path, "rb") as model_file:
        classifier = pickle.load(model_file)

    predictions = classifier.predict(features)
    return predictions[0]

def predict_lgb(wav_file, model_path):
    """Classify one audio file with a LightGBM booster loaded from disk.

    Args:
        wav_file: Path of the audio file to classify.
        model_path: Path of the saved booster, passed as ``model_file``.

    Returns:
        1 when the first predicted value is strictly greater than 0.5,
        otherwise 0.
    """
    features = extract_embedding(wav_file)
    booster = lgb.Booster(model_file=model_path)
    score = booster.predict(features)[0]
    # Strict comparison: a score of exactly 0.5 maps to 0.
    above_threshold = score > 0.5
    return int(above_threshold)


#### Test all three models on an example clip

In [None]:
# Clip to classify. Notes on the other example clips:
#   "./examples/f.wav"        -- a fake; detected by XGBoost and LightGBM
#   "./examples/example2.wav" -- a fake as well; only detected by LightGBM
wav_file = "./examples/example3.wav"

# Score the same clip with each of the three saved models.
nn_prob = predict_nn(wav_file=wav_file, model_path="./Models/voice_nn_model.pth")
xgb_pred = predict_xgb(wav_file=wav_file, model_path="./Models/voice_classifier.pkl")
lgb_pred = predict_lgb(wav_file=wav_file, model_path="./Models/lgb_voice_model.lgb")

def label(x):
    """Map a score to a class name: "REAL" at 0.5 or above, else "FAKE"."""
    if x >= 0.5:
        return "REAL"
    return "FAKE"

# One report line per model, emitted with a single print call.
# NOTE(review): the number shown as "confidence" is nn_prob printed as-is,
# i.e. the same score label() compares against 0.5 -- not a confidence in
# whichever label was chosen.
print(
    "Voice Classification Results\n",
    f"Neural Network : {label(nn_prob)}  (confidence={nn_prob:.3f})",
    f"XGBoost        : {label(xgb_pred)}",
    f"LightGBM       : {label(lgb_pred)}",
    sep="\n",
)


Voice Classification Results

Neural Network : REAL  (confidence=0.846)
XGBoost        : FAKE
LightGBM       : FAKE
