In [None]:
!pip install transformers torchaudio librosa soundfile xgboost

Collecting xgboost==1.7.6
  Downloading xgboost-1.7.6-py3-none-manylinux2014_x86_64.whl.metadata (1.9 kB)
Downloading xgboost-1.7.6-py3-none-manylinux2014_x86_64.whl (200.3 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m200.3/200.3 MB[0m [31m7.0 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: xgboost
  Attempting uninstall: xgboost
    Found existing installation: xgboost 3.1.2
    Uninstalling xgboost-3.1.2:
      Successfully uninstalled xgboost-3.1.2
Successfully installed xgboost-1.7.6


In [None]:
import os
import librosa
import numpy as np
import torch
from transformers import Wav2Vec2FeatureExtractor, Wav2Vec2Model
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, classification_report
import pickle
from tqdm import tqdm

In [None]:
# Checkpoint shared by the feature extractor and the encoder; keeping it in a
# single constant guarantees the two are always loaded from the same model.
WAV2VEC_CHECKPOINT = "facebook/wav2vec2-xls-r-300m"

# Prefer the GPU when one is visible to torch, otherwise fall back to CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"
print("Using device:", device)

processor = Wav2Vec2FeatureExtractor.from_pretrained(WAV2VEC_CHECKPOINT)
model = Wav2Vec2Model.from_pretrained(WAV2VEC_CHECKPOINT)

# The encoder is only used for inference in this notebook, so put it in eval
# mode explicitly. Assigning the result (instead of leaving `model.to(device)`
# as the cell's last expression) keeps the full module repr out of the output.
model = model.to(device).eval()

Using device: cuda


pytorch_model.bin:   0%|          | 0.00/1.27G [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.27G [00:00<?, ?B/s]

Wav2Vec2Model(
  (feature_extractor): Wav2Vec2FeatureEncoder(
    (conv_layers): ModuleList(
      (0): Wav2Vec2LayerNormConvLayer(
        (conv): Conv1d(1, 512, kernel_size=(10,), stride=(5,))
        (layer_norm): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
        (activation): GELUActivation()
      )
      (1-4): 4 x Wav2Vec2LayerNormConvLayer(
        (conv): Conv1d(512, 512, kernel_size=(3,), stride=(2,))
        (layer_norm): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
        (activation): GELUActivation()
      )
      (5-6): 2 x Wav2Vec2LayerNormConvLayer(
        (conv): Conv1d(512, 512, kernel_size=(2,), stride=(2,))
        (layer_norm): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
        (activation): GELUActivation()
      )
    )
  )
  (feature_projection): Wav2Vec2FeatureProjection(
    (layer_norm): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
    (projection): Linear(in_features=512, out_features=1024, bias=True)
    (dropout)

In [None]:
def extract_embedding(audio_path, target_sr=16000):
    """Return a single fixed-size wav2vec2 embedding for one audio file.

    The file is loaded with librosa (resampled to ``target_sr``), passed through
    the notebook-level ``processor`` and ``model``, and the encoder's
    ``last_hidden_state`` is averaged over dim 1.

    Args:
        audio_path: Path of the audio file to embed.
        target_sr: Sampling rate in Hz that the audio is resampled to and that
            is reported to the feature extractor. Defaults to 16000.

    Returns:
        A flattened 1-D NumPy array containing the averaged hidden state.

    Raises:
        ValueError: If the file decodes to zero audio samples.
    """
    # librosa hands back a NumPy array, which the feature extractor accepts
    # directly, so no intermediate torch tensor is needed here.
    waveform, _ = librosa.load(audio_path, sr=target_sr)
    if waveform.size == 0:
        raise ValueError(f"No audio samples could be read from {audio_path!r}")

    inputs = processor(waveform, sampling_rate=target_sr, return_tensors="pt", padding=True)
    # Move every tensor produced by the processor onto the model's device.
    inputs = {name: tensor.to(device) for name, tensor in inputs.items()}

    # Inference only: skip autograd bookkeeping.
    with torch.no_grad():
        hidden_states = model(**inputs).last_hidden_state

    # Pool over dim 1 (presumably the frame/sequence axis of the encoder
    # output) so clips of any length map to a vector of the same size.
    return hidden_states.mean(dim=1).cpu().numpy().flatten()

In [None]:
def load_split(split_path, cache_embeddings=True, cache_file=None):
    """Build, or load from cache, the embeddings and labels for one dataset split.

    ``split_path`` is expected to contain ``real`` and ``fake`` sub-folders.
    Every file ending in ``.wav`` inside them is embedded with
    ``extract_embedding``; files under ``real`` are labelled 0 and files under
    ``fake`` are labelled 1.

    Args:
        split_path: Directory holding the ``real`` and ``fake`` sub-folders.
        cache_embeddings: When true (and ``cache_file`` is given), reuse an
            existing cache file and write one after computing embeddings.
        cache_file: Path of the ``.npz`` cache. ``None`` disables caching.

    Returns:
        Tuple ``(X, y)`` of NumPy arrays: the stacked embeddings and the
        matching integer labels.
    """
    use_cache = bool(cache_embeddings and cache_file)

    if use_cache and os.path.exists(cache_file):
        print(f"Loading cached embeddings from {cache_file}...")
        # The cache only holds plain numeric arrays, so pickle support is left
        # off (unpickling can execute arbitrary code). The context manager
        # closes the archive once both arrays have been read into memory.
        with np.load(cache_file) as cached:
            return cached['X'], cached['y']

    X, y = [], []
    for label, subfolder in enumerate(["real", "fake"]):
        folder = os.path.join(split_path, subfolder)
        print(f"Loading {subfolder} voices from: {folder}")
        # os.listdir returns entries in arbitrary order; sorting makes the row
        # order of X / y (and therefore the cache contents) reproducible.
        for file in tqdm(sorted(os.listdir(folder))):
            if file.endswith(".wav"):
                emb = extract_embedding(os.path.join(folder, file))
                X.append(emb)
                y.append(label)

    X = np.array(X)
    y = np.array(y)

    if use_cache:
        np.savez_compressed(cache_file, X=X, y=y)
        print(f"Saved embeddings to {cache_file}")

    return X, y

In [None]:
DATASET_ROOT = "/content/drive/MyDrive/dataset/voices/"

# Each split lives in its own sub-folder of DATASET_ROOT and gets its own
# embedding cache file.
train_dir = os.path.join(DATASET_ROOT, "training")
val_dir = os.path.join(DATASET_ROOT, "validation")
test_dir = os.path.join(DATASET_ROOT, "testing")

X_train, y_train = load_split(train_dir, cache_file="train_embeddings.npz")
X_val, y_val = load_split(val_dir, cache_file="val_embeddings.npz")
X_test, y_test = load_split(test_dir, cache_file="test_embeddings.npz")

# Report the shape of each split's embedding matrix.
print("Dataset sizes:")
for heading, matrix in ((" Train:", X_train), (" Val:  ", X_val), (" Test: ", X_test)):
    print(heading, matrix.shape)

Loading real voices from: /content/drive/MyDrive/dataset/voices/training/real


100%|██████████| 6978/6978 [07:14<00:00, 16.06it/s]


Loading fake voices from: /content/drive/MyDrive/dataset/voices/training/fake


100%|██████████| 6978/6978 [07:06<00:00, 16.35it/s]


Saved embeddings to train_embeddings.npz
Loading real voices from: /content/drive/MyDrive/dataset/voices/validation/real


100%|██████████| 1413/1413 [01:17<00:00, 18.19it/s]


Loading fake voices from: /content/drive/MyDrive/dataset/voices/validation/fake


100%|██████████| 1413/1413 [01:18<00:00, 17.95it/s]


Saved embeddings to val_embeddings.npz
Loading real voices from: /content/drive/MyDrive/dataset/voices/testing/real


100%|██████████| 544/544 [00:28<00:00, 19.36it/s]


Loading fake voices from: /content/drive/MyDrive/dataset/voices/testing/fake


100%|██████████| 544/544 [00:27<00:00, 19.76it/s]


Saved embeddings to test_embeddings.npz
Dataset sizes:
 Train: (13956, 1024)
 Val:   (2826, 1024)
 Test:  (1088, 1024)


In [None]:
# Gradient-boosted tree classifier trained on the precomputed embeddings
# (label 0 = real, label 1 = fake, as assigned when the splits were loaded).
model_xgb = XGBClassifier(
    n_estimators=1000,
    learning_rate=0.03,
    max_depth=7,
    subsample=0.9,
    colsample_bytree=0.9,
    eval_metric="logloss",
    # Stop once the validation logloss has not improved for 30 rounds. This is
    # configured on the estimator because passing it to fit() is deprecated.
    early_stopping_rounds=30,
    # Explicit seed so the row/column subsampling above is reproducible.
    random_state=0,
    # NOTE: `gpu_hist` and `predictor` are xgboost 1.x options and need a CUDA
    # device; newer xgboost releases are expected to want
    # tree_method="hist" with device="cuda" instead -- verify before upgrading.
    tree_method="gpu_hist",
    predictor="gpu_predictor"
)

# verbose=50 logs the validation metric every 50 rounds instead of every round.
# The trailing semicolon stops the notebook from displaying the returned
# estimator; in the recorded run that display step appears to be what raised
# "AttributeError: 'super' object has no attribute '__sklearn_tags__'".
model_xgb.fit(
    X_train, y_train,
    eval_set=[(X_val, y_val)],
    verbose=50
);



[0]	validation_0-logloss:0.67271
[1]	validation_0-logloss:0.65367
[2]	validation_0-logloss:0.63478
[3]	validation_0-logloss:0.61760
[4]	validation_0-logloss:0.60119
[5]	validation_0-logloss:0.58516
[6]	validation_0-logloss:0.56982
[7]	validation_0-logloss:0.55556
[8]	validation_0-logloss:0.54138
[9]	validation_0-logloss:0.52759
[10]	validation_0-logloss:0.51478
[11]	validation_0-logloss:0.50238
[12]	validation_0-logloss:0.49061
[13]	validation_0-logloss:0.47907
[14]	validation_0-logloss:0.46820
[15]	validation_0-logloss:0.45734
[16]	validation_0-logloss:0.44673
[17]	validation_0-logloss:0.43714
[18]	validation_0-logloss:0.42787
[19]	validation_0-logloss:0.41880
[20]	validation_0-logloss:0.41004
[21]	validation_0-logloss:0.40124
[22]	validation_0-logloss:0.39265
[23]	validation_0-logloss:0.38484
[24]	validation_0-logloss:0.37708
[25]	validation_0-logloss:0.36952
[26]	validation_0-logloss:0.36224
[27]	validation_0-logloss:0.35524
[28]	validation_0-logloss:0.34825
[29]	validation_0-loglos

AttributeError: 'super' object has no attribute '__sklearn_tags__'

AttributeError: 'super' object has no attribute '__sklearn_tags__'

XGBClassifier(base_score=None, booster=None, callbacks=None,
              colsample_bylevel=None, colsample_bynode=None,
              colsample_bytree=0.9, early_stopping_rounds=None,
              enable_categorical=False, eval_metric='logloss',
              feature_types=None, gamma=None, gpu_id=None, grow_policy=None,
              importance_type=None, interaction_constraints=None,
              learning_rate=0.03, max_bin=None, max_cat_threshold=None,
              max_cat_to_onehot=None, max_delta_step=None, max_depth=7,
              max_leaves=None, min_child_weight=None, missing=nan,
              monotone_constraints=None, n_estimators=1000, n_jobs=None,
              num_parallel_tree=None, predictor='gpu_predictor',
              random_state=None, ...)

In [None]:
# Score the held-out test split with the fitted classifier.
y_pred = model_xgb.predict(X_test)

# Compute both summaries first, then print them together.
test_accuracy = accuracy_score(y_test, y_pred)
test_report = classification_report(y_test, y_pred)

print("\nAccuracy:", test_accuracy)
print("\nClassification Report:\n", test_report)


Accuracy: 0.9577205882352942

Classification Report:
               precision    recall  f1-score   support

           0       0.94      0.98      0.96       544
           1       0.98      0.93      0.96       544

    accuracy                           0.96      1088
   macro avg       0.96      0.96      0.96      1088
weighted avg       0.96      0.96      0.96      1088



In [None]:
# Persist the fitted classifier so it can be reloaded later for inference.
model_file = "voice_classifier.pkl"
serialized_model = pickle.dumps(model_xgb)
with open(model_file, mode="wb") as handle:
    handle.write(serialized_model)
print(f"XGBoost model saved as {model_file}")

XGBoost model saved as voice_classifier.pkl


In [None]:
# Classifiers already unpickled by predict_voice, keyed by model path. Each
# entry stores the file's modification time so that a model re-saved later in
# the same session is reloaded instead of served stale from the cache.
_classifier_cache = {}


def predict_voice(audio_path, model_path="voice_classifier.pkl"):
    """Classify one audio file as "REAL" or "FAKE".

    Args:
        audio_path: Path of the audio file to classify.
        model_path: Path of the pickled classifier to use. Defaults to the
            file written by the save cell of this notebook.

    Returns:
        "REAL" when the classifier predicts label 0, otherwise "FAKE".

    Raises:
        FileNotFoundError: If ``model_path`` does not exist.
    """
    modified_at = os.path.getmtime(model_path)
    cached = _classifier_cache.get(model_path)
    if cached is None or cached[0] != modified_at:
        # SECURITY: pickle.load can execute arbitrary code embedded in the
        # file -- only point model_path at files you created yourself.
        with open(model_path, "rb") as f:
            cached = (modified_at, pickle.load(f))
        _classifier_cache[model_path] = cached
    saved_model = cached[1]

    # The classifier expects a 2-D batch, so reshape the single embedding
    # into one row.
    emb = extract_embedding(audio_path).reshape(1, -1)
    pred = saved_model.predict(emb)[0]
    return "REAL" if pred == 0 else "FAKE"



print(predict_voice("sample.wav")) # The voice is real


REAL
