In [1]:
%pip install transformers torchaudio librosa soundfile


Collecting librosa
  Downloading librosa-0.11.0-py3-none-any.whl.metadata (8.7 kB)
Collecting soundfile
  Downloading soundfile-0.13.1-py2.py3-none-macosx_11_0_arm64.whl.metadata (16 kB)
Collecting audioread>=2.1.9 (from librosa)
  Downloading audioread-3.0.1-py3-none-any.whl.metadata (8.4 kB)
Collecting numba>=0.51.0 (from librosa)
  Downloading numba-0.61.2-cp311-cp311-macosx_11_0_arm64.whl.metadata (2.8 kB)
Collecting pooch>=1.1 (from librosa)
  Downloading pooch-1.8.2-py3-none-any.whl.metadata (10 kB)
Collecting soxr>=0.3.2 (from librosa)
  Downloading soxr-0.5.0.post1-cp311-cp311-macosx_11_0_arm64.whl.metadata (5.6 kB)
Collecting lazy_loader>=0.1 (from librosa)
  Downloading lazy_loader-0.4-py3-none-any.whl.metadata (7.6 kB)
Collecting msgpack>=1.0 (from librosa)
  Downloading msgpack-1.1.0-cp311-cp311-macosx_11_0_arm64.whl.metadata (8.4 kB)
Collecting llvmlite<0.45,>=0.44.0dev0 (from numba>=0.51.0->librosa)
  Downloading llvmlite-0.44.0-cp311-cp311-macosx_11_0_arm64.whl.metadata 

In [2]:
# Step 1: Import required libraries
from transformers import AutoProcessor, AutoModelForAudioClassification
import torch
import librosa
import os

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# Step 2: Load the pre-trained model and processor from Hugging Face
model_name = "MelodyMachine/Deepfake-audio-detection-V2"

# Load processor: responsible for preparing the raw audio for the model
processor = AutoProcessor.from_pretrained(model_name)

# Load model: pre-trained for detecting deepfake (FAKE vs REAL) in audio
model = AutoModelForAudioClassification.from_pretrained(model_name)



OSError: Can't load tokenizer for 'MelodyMachine/Deepfake-audio-detection-V2'. If you were trying to load it from 'https://huggingface.co/models', make sure you don't have a local directory with the same name. Otherwise, make sure 'MelodyMachine/Deepfake-audio-detection-V2' is the correct path to a directory containing all relevant files for a Wav2Vec2CTCTokenizer tokenizer.

In [None]:
# Step 3: Function to predict if audio is FAKE or REAL
def detect_audio_fake(audio_path):
    # Load audio file using librosa (resample to 16kHz as required by model)
    waveform, sample_rate = librosa.load(audio_path, sr=16000)

    # Process audio for model input
    inputs = processor(waveform, sampling_rate=16000, return_tensors="pt")

    # Inference using model (no gradient needed for evaluation)
    with torch.no_grad():
        outputs = model(**inputs)
        logits = outputs.logits

    # Get prediction
    predicted_class_id = torch.argmax(logits, dim=-1).item()
    predicted_label = model.config.id2label[predicted_class_id]

    # Get confidence score (softmax)
    probabilities = torch.nn.functional.softmax(logits, dim=-1)[0]
    confidence = probabilities[predicted_class_id].item()

    return predicted_label, confidence

In [None]:
# Step 4: Run prediction on an audio file
if __name__ == "__main__":
    audio_file = "sample_audio.wav"  # Change this to your audio file path

    if os.path.exists(audio_file):
        label, confidence = detect_audio_fake(audio_file)
        print(f"\n🔍 Prediction: {label}")
        print(f"✅ Confidence: {confidence:.2f}")
    else:
        print(f"❌ Audio file not found: {audio_file}")

In [7]:
from transformers import pipeline

# Load the pre-trained pipeline
audio_classifier = pipeline("audio-classification", model="MelodyMachine/Deepfake-audio-detection-V2")

# Provide the path to your audio file (16kHz WAV format recommended)
audio_path = "/Users/fenilvadher/Documents/Collage Data/SEM - 6/AI/AI Project/fake_audio/real/speaker1_1.wav"

# Run the pipeline
results = audio_classifier(audio_path)

# Print results
print("Audio Classification Results:")
for res in results:
    print(f"Label: {res['label']}, Score: {res['score']:.4f}")


Device set to use mps:0


Audio Classification Results:
Label: real, Score: 1.0000
Label: fake, Score: 0.0000
