In [4]:
!pip install faiss-gpu-cu11==1.10.0

Collecting faiss-gpu-cu11==1.10.0
  Downloading faiss_gpu_cu11-1.10.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (11 kB)
Collecting numpy<2 (from faiss-gpu-cu11==1.10.0)
  Downloading numpy-1.26.4-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (61 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m61.0/61.0 kB[0m [31m6.3 MB/s[0m eta [36m0:00:00[0m
Collecting nvidia-cuda-runtime-cu11>=11.8.89 (from faiss-gpu-cu11==1.10.0)
  Downloading nvidia_cuda_runtime_cu11-11.8.89-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cublas-cu11>=11.11.3.6 (from faiss-gpu-cu11==1.10.0)
  Downloading nvidia_cublas_cu11-11.11.3.6-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Downloading faiss_gpu_cu11-1.10.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (47.8 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m47.8/47.8 MB[0m [31m18.2 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading nu

In [1]:
import logging
from config import Config
from dataset import AudioDataset
import argparse
import os
from pipeline import DeepfakeDetectionPipeline
import torch

def main(data_fraction=0.5, mode="train", model_prefix="final_model", audio_path=None):
    """Run the complete audio deepfake detection pipeline.

    Calling ``main()`` with no arguments keeps the previous hard-coded
    behaviour: train on half of the data.

    Args:
        data_fraction: Fraction of the data to use (e.g. 0.25 for 25%);
            stored on ``config.data_fraction``.
        mode: Operation mode, one of ``"train"``, ``"evaluate"`` or ``"predict"``.
        model_prefix: Prefix for saved model files, passed to
            ``pipeline.load_models`` in evaluate and predict mode.
        audio_path: Path to the audio file to classify. Required when
            ``mode == "predict"``; ignored otherwise.

    Raises:
        ValueError: If ``mode`` is not one of the supported modes, or if
            predict mode is requested without an ``audio_path``.
    """
    # Validate the request up front so a bad call fails before the
    # pipeline and its models are constructed.
    valid_modes = ("train", "evaluate", "predict")
    if mode not in valid_modes:
        raise ValueError(f"Unknown mode {mode!r}; expected one of {valid_modes}")
    if mode == "predict" and not audio_path:
        raise ValueError("Audio path must be provided for predict mode")

    # Configure logging
    logging.basicConfig(
        level=logging.INFO,
        format='%(asctime)s - %(name)s - %(levelname)s - %(message)s'
    )

    # Disable problematic torchaudio backends
    os.environ["TORCHAUDIO_USE_SOX"] = "0"
    os.environ["TORCHAUDIO_USE_BACKEND_DISPATCHER"] = "1"

    # Pick the device. torch.cuda.set_device() rejects non-CUDA devices, so it
    # must only be called when a GPU was actually selected; otherwise the CPU
    # fallback above would crash instead of running.
    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    if device.type == "cuda":
        torch.cuda.set_device(device)

    # Create configuration
    config = Config()
    config.device = device
    config.data_fraction = data_fraction
    config.train_split = 0.8

    # DataLoader / retrieval settings. The getattr() calls only supply a
    # default when Config does not already define the attribute.
    config.num_workers = max(1, torch.cuda.device_count() * 2)
    config.train_batch_size = getattr(config, "train_batch_size", 128)
    config.eval_batch_size = getattr(config, "eval_batch_size", 128)
    config.db_batch_size = getattr(config, "db_batch_size", 64)
    config.top_k = getattr(config, "top_k", 5)
    config.use_batch_norm = False
    config.use_layer_norm = True

    # Build the pipeline from the finished configuration
    pipeline = DeepfakeDetectionPipeline(config)

    if mode == "train":
        # Instantiate the train/validation datasets with the split flag set
        train_dataset = AudioDataset(config, is_train=True, split_data=True)
        val_dataset = AudioDataset(config, is_train=False, split_data=True)

        pipeline.train(train_dataset, val_dataset)

    elif mode == "evaluate":
        # Restore the saved models and the vector DB before evaluating
        pipeline.load_models(model_prefix)
        pipeline.vector_db.load()

        test_dataset = AudioDataset(config, is_train=False, split_data=False)
        metrics = pipeline.evaluate_with_metrics(test_dataset)

        print("Evaluation metrics:")
        for key, value in metrics.items():
            print(f"{key}: {value}")

    else:  # mode == "predict" (guaranteed by the validation above)
        # Restore the saved models and the vector DB before predicting
        pipeline.load_models(model_prefix)
        pipeline.vector_db.load()

        # Single-file prediction
        result = pipeline.predict(audio_path)
        logging.info(f"Prediction  : {result['prediction']}")
        logging.info(f"Probability : {result['probability']:.4f}")
        logging.info(f"Retrieved   : {result['retrieved_labels']}")

# Entry point for this cell: a bare main() call runs the pipeline in "train" mode.
main()

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Fetching 1 files:   0%|          | 0/1 [00:00<?, ?it/s]

Some weights of Wav2Vec2Model were not initialized from the model checkpoint at facebook/wav2vec2-base-960h and are newly initialized: ['masked_spec_embed']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Feature dimension set to: 768


Vector DB Build: 100%|██████████| 199/199 [06:01<00:00,  1.82s/it]
Epoch 1/5: 100%|██████████| 100/100 [05:46<00:00,  3.47s/it]
Evaluating: 100%|██████████| 25/25 [01:33<00:00,  3.74s/it]


Epoch 1: Train Loss: 0.8185, Train Acc: 0.6178, Val Loss: 0.7604, Val Acc:0.7193


Epoch 2/5: 100%|██████████| 100/100 [05:42<00:00,  3.42s/it]
Evaluating: 100%|██████████| 25/25 [01:24<00:00,  3.38s/it]


Epoch 2: Train Loss: 0.7362, Train Acc: 0.6954, Val Loss: 0.6800, Val Acc:0.7344


Epoch 3/5: 100%|██████████| 100/100 [05:37<00:00,  3.38s/it]
Evaluating: 100%|██████████| 25/25 [01:24<00:00,  3.37s/it]


Epoch 3: Train Loss: 0.6925, Train Acc: 0.7224, Val Loss: 0.6470, Val Acc:0.7395


Epoch 4/5: 100%|██████████| 100/100 [05:36<00:00,  3.37s/it]
Evaluating: 100%|██████████| 25/25 [01:24<00:00,  3.37s/it]


Epoch 4: Train Loss: 0.6856, Train Acc: 0.7255, Val Loss: 0.6303, Val Acc:0.7520


Epoch 5/5: 100%|██████████| 100/100 [05:36<00:00,  3.36s/it]
Evaluating: 100%|██████████| 25/25 [01:23<00:00,  3.36s/it]

Epoch 5: Train Loss: 0.6342, Train Acc: 0.7563, Val Loss: 0.6258, Val Acc:0.7819



