Imports 

In [None]:
# ===== Standard Libraries =====
import os
import time
import shutil
from pathlib import Path
from glob import glob

# ===== Data Handling =====
import numpy as np
import pandas as pd

# ===== Audio Processing =====
import librosa
import librosa.display
import soundfile as sf
import noisereduce as nr
import IPython.display as ipd

# ===== PyTorch =====
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader

# ===== Evaluation & Metrics =====
from sklearn.model_selection import train_test_split
from sklearn.utils import resample
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score, f1_score,
    roc_auc_score, roc_curve, confusion_matrix, ConfusionMatrixDisplay
)
from scipy.optimize import brentq
from scipy.interpolate import interp1d

# ===== Visualization & Utilities =====
import matplotlib.pyplot as plt
from tqdm import tqdm




Cuda test

In [None]:
# Check whether CUDA is usable. The device name is only queried when a GPU is
# actually available, because torch.cuda.get_device_name(0) raises otherwise
# and would abort the cell on CPU-only machines.
print(torch.cuda.is_available())    # should say True if available
if torch.cuda.is_available():
    print(torch.cuda.get_device_name(0))   # name of GPU 0, e.g. NVIDIA GeForce RTX 3060
else:
    print("CUDA not available - running on CPU")

Size of Dataset, i.e the number of total files

In [None]:
#Function used to see the number of files in the root of release-in-the-wild dataset

def count_root_files(root_dir):
    """Return the number of regular files located directly in ``root_dir``.

    Sub-directories are not counted and are not descended into.
    """
    total = 0
    for name in os.listdir(root_dir):
        # Only entries that resolve to an actual file contribute to the count
        if os.path.isfile(os.path.join(root_dir, name)):
            total += 1
    return total

# Path to the dataset root (relative to the notebook's working directory).
# NOTE: count_root_files counts every regular file in this folder, not only .wav files.
folder_path = "datasets/release_in_the_wild"
file_count = count_root_files(folder_path)

# Print the result
print(f"Total files directly in '{folder_path}': {file_count}")



Dataset Sample, Split and Oversampling

In [None]:
# --------------------------------------------------------------
# Samples 40% of a deepfake audio dataset (stratified by label),
# splits that sample into stratified train/val/test sets (70/15/15),
# and prepares the output folders. Copying the .wav files and saving
# the split metadata are done by the (commented-out) calls further down.
# NOTE: no oversampling is performed in this cell; the train set is only shuffled.
# --------------------------------------------------------------


# Fixed random seed so the sampling and splits are reproducible
RANDOM_SEED = 42
folder_path = "datasets/release_in_the_wild"
meta_csv_path = os.path.join(folder_path, "meta.csv")
# The source .wav files are read from the dataset root itself
WAV_DIR = folder_path

# Output directory for each split; the 40% sample is divided into train/val/test
OUTPUT_DIRS = {
    "train": os.path.join(folder_path, "train"),
    "val": os.path.join(folder_path, "val"),
    "test": os.path.join(folder_path, "test")
}

# Create the output folders (no error if they already exist)
for path in OUTPUT_DIRS.values():
    os.makedirs(path, exist_ok=True)

# Load the metadata CSV and strip surrounding whitespace from the text columns
df = pd.read_csv(meta_csv_path)
df["file"] = df["file"].str.strip()
df["label"] = df["label"].str.strip()
df["speaker"] = df["speaker"].str.strip()

# Sample 40% of the full dataset, stratified by label (the remaining 60% is discarded)
df_40, _ = train_test_split(df, train_size=0.4, stratify=df["label"], random_state=RANDOM_SEED)

# Split the sample into Train (70%) and a temporary 30% part,
# then halve the temporary part into Val (15%) and Test (15%)
train_df, temp_df = train_test_split(df_40, test_size=0.30, stratify=df_40["label"], random_state=RANDOM_SEED)
val_df, test_df = train_test_split(temp_df, test_size=0.5, stratify=temp_df["label"], random_state=RANDOM_SEED)


# frac=1.0 keeps every row; this only reorders the training rows
train_df = train_df.sample(frac=1.0, random_state=RANDOM_SEED)  # just shuffle it

# File copying function
def copy_files(subset_df, split_name):
    """Copy the audio files listed in ``subset_df`` into one split folder.

    Args:
        subset_df: DataFrame with a "file" column holding paths relative to WAV_DIR.
        split_name: key of OUTPUT_DIRS selecting the destination folder.

    Files that are listed in the metadata but missing from WAV_DIR are still
    skipped (best effort), but they are reported so that a split folder which
    disagrees with its metadata does not go unnoticed.
    """
    dest_root = OUTPUT_DIRS[split_name]
    missing = []
    for rel_path in subset_df["file"]:
        src = os.path.join(WAV_DIR, rel_path)
        if not os.path.exists(src):
            missing.append(rel_path)
            continue
        dst = os.path.join(dest_root, rel_path)
        # "file" may contain sub-folders, so make sure the parent exists
        os.makedirs(os.path.dirname(dst), exist_ok=True)
        shutil.copy(src, dst)

    if missing:
        print(
            f"[WARN] {split_name}: {len(missing)} file(s) listed in the metadata "
            f"were not found in '{WAV_DIR}' (first: {missing[0]})"
        )


# Copy files (uncomment to run if dataset has not yet been split) 
#copy_files(train_df, "train")
#copy_files(val_df, "val")
#copy_files(test_df, "test")


# Save csv (uncomment to run if csv files not yet created)
#train_df.to_csv(os.path.join(folder_path, "train_meta.csv"), index=False)
#val_df.to_csv(os.path.join(folder_path, "val_meta.csv"), index=False)
#test_df.to_csv(os.path.join(folder_path, "test_meta.csv"), index=False)

#  checking to ensure splits were done correct and rechecking the split of labels (spoof or bona-fide) for each sub set
def count_files(folder):
    """Count the regular files directly inside ``folder`` (sub-folders are ignored)."""
    candidates = (os.path.join(folder, name) for name in os.listdir(folder))
    return sum(1 for candidate in candidates if os.path.isfile(candidate))

# Summary of the split. The cell above never oversamples (the train set is only
# shuffled), so the summary states that instead of claiming spoof oversampling.
print("40% of dataset sampled and split into train/val/test (70/15/15).")
print("No oversampling applied (train set is only shuffled).")

# File counts reflect what is currently on disk in each split folder
print(f"Train files: {count_files(OUTPUT_DIRS['train'])}")
print(f"Val files: {count_files(OUTPUT_DIRS['val'])}")
print(f"Test files: {count_files(OUTPUT_DIRS['test'])}")

# Label counts come from the in-memory split DataFrames
print("\nLabel Distribution:")
print("Train:\n", train_df["label"].value_counts())
print("Val:\n", val_df["label"].value_counts())
print("Test:\n", test_df["label"].value_counts())





Audio Preprocess Function

In [None]:

# ================================================================
# This function prepares raw audio to be ready for model input.
# It:
# - Removes silence at the start/end of the audio.
# - Optionally applies noise reduction to clean the signal.
# - Optionally applies pre-emphasis to boost high frequencies.
# - Normalises volume using RMS or peak method.
# - Ensures the audio is a fixed length by padding or trimming.
# ================================================================

def preprocess_audio(y, sr, target_duration=6.0, apply_preemphasis=False, apply_reduction=False, coef=0.5, normalise='rms'):
    """Clean a waveform and force it to a fixed number of samples.

    Steps, in order: trim leading/trailing silence, optional noise reduction,
    optional pre-emphasis, optional level normalisation, then zero-pad at the
    end or truncate to ``int(sr * target_duration)`` samples.

    Args:
        y: audio samples (assumes a mono, 1-D array - TODO confirm).
        sr: sample rate of ``y``.
        target_duration: output length in seconds.
        apply_preemphasis: run librosa's pre-emphasis filter when truthy.
        apply_reduction: run noisereduce on the signal when truthy.
        coef: pre-emphasis coefficient (only used with ``apply_preemphasis``).
        normalise: 'rms' divides by the RMS level, 'peak' divides by the
            largest absolute sample; any other value leaves the level as is.

    Returns:
        The processed samples, padded or cut to the target length.
    """
    signal, _ = librosa.effects.trim(y)

    # Optional clean-up stages
    if apply_reduction:
        signal = nr.reduce_noise(y=signal, sr=sr)
    if apply_preemphasis:
        signal = librosa.effects.preemphasis(signal, coef=coef)

    # Level normalisation; the small epsilon avoids division by zero
    if normalise == 'rms':
        level = np.sqrt(np.mean(signal**2))
        signal = signal / (level + 1e-6)
    elif normalise == 'peak':
        signal = signal / (np.max(np.abs(signal)) + 1e-6)

    # Fixed length: zero-pad short clips at the end, cut long ones
    n_target = int(sr * target_duration)
    n_missing = n_target - len(signal)
    if n_missing > 0:
        return np.pad(signal, (0, n_missing))
    return signal[:n_target]


Audio data visualisation - (see effect of preprocessing function on audio samples) 

In [None]:
# ================================================================
# This function compares real vs fake audio in two ways:
# 1. Before preprocessing (original)
# 2. After preprocessing (denoised, normalised, fixed-length)
#
# It visualises:
# - Waveforms side by side
# - Mel spectrograms side by side
# - Also plays back the audio for listening comparison
# ================================================================


def load_audio(path, sr=22050):
    """Load ``path`` with librosa at sample rate ``sr`` and return only the samples."""
    return librosa.load(path, sr=sr)[0]

# Visualise waveform comparison between real and fake audio
def show_waveform_comparison(y_real, y_fake, sr, title_suffix=""):
    """Draw the real (left) and fake (right) waveforms in one figure.

    ``title_suffix`` is appended to both panel titles.
    """
    _, panels = plt.subplots(1, 2, figsize=(14, 3))
    for panel, signal, tag in zip(panels, (y_real, y_fake), ("Real", "Fake")):
        librosa.display.waveshow(signal, sr=sr, ax=panel)
        panel.set_title(f"{tag} - Waveform {title_suffix}")
    plt.tight_layout()
    plt.show()

# Visualise mel spectrogram comparison
def show_mel_spectrogram_comparison(y_real, y_fake, sr, title_suffix=""):
    """Draw 128-band mel spectrograms (in dB) of the real and fake clips side by side.

    Each spectrogram is converted to dB relative to its own maximum and gets
    its own colorbar; ``title_suffix`` is appended to both panel titles.
    """
    fig, panels = plt.subplots(1, 2, figsize=(14, 4))

    for panel, signal, tag in zip(panels, (y_real, y_fake), ("Real", "Fake")):
        power = librosa.feature.melspectrogram(y=signal, sr=sr, n_mels=128)
        decibels = librosa.power_to_db(power, ref=np.max)

        image = librosa.display.specshow(decibels, sr=sr, x_axis='time', y_axis='mel', ax=panel)
        panel.set_title(f"{tag} - Mel Spectrogram {title_suffix}")
        fig.colorbar(image, ax=panel, format='%+2.0f dB')

    plt.tight_layout()
    plt.show()
# Compares two audio files side by side so the differences before and after
# preprocessing can be seen (and heard) clearly.
def compare_audio(real_path, fake_path, apply_preprocessing=False, sr=22050):
    """Plot and play a real and a fake clip, optionally after preprocessing.

    Args:
        real_path: path of the clip shown as "Real".
        fake_path: path of the clip shown as "Fake".
        apply_preprocessing: when truthy, both clips go through
            ``preprocess_audio`` with pre-emphasis, noise reduction and RMS
            normalisation enabled before being displayed.
        sr: sample rate used for loading, plotting and playback.
    """
    clips = [load_audio(real_path, sr=sr), load_audio(fake_path, sr=sr)]

    if apply_preprocessing:
        clips = [
            preprocess_audio(clip, sr, apply_preemphasis=True, apply_reduction=True, normalise='rms')
            for clip in clips
        ]
        label = "(Preprocessed)"
    else:
        label = "(Original)"

    y_real, y_fake = clips
    print(f"\n--- {label} ---")

    # Waveforms first
    show_waveform_comparison(y_real, y_fake, sr, title_suffix=label)

    # Inline audio players so both clips can be listened to
    for heading, clip in (("Real Audio:", y_real), ("Fake Audio:", y_fake)):
        print(heading)
        ipd.display(ipd.Audio(clip, rate=sr))

    # Mel spectrograms last
    show_mel_spectrogram_comparison(y_real, y_fake, sr, title_suffix=label)

# Paths to the two example clips in the dataset root
# (presumably 72.wav is bona-fide and 2.wav is spoof - verify against meta.csv)
real_audio_path = "datasets/release_in_the_wild/72.wav"
fake_audio_path = "datasets/release_in_the_wild/2.wav"

# Run the comparison twice: first on the raw audio, then after preprocessing
compare_audio(real_audio_path, fake_audio_path, apply_preprocessing=False)
compare_audio(real_audio_path, fake_audio_path, apply_preprocessing=True)





Visualising each feature I intend to extract to help see differences in bona-fide and spoof audio samples

In [None]:
# ================================================================
# This function visually compares feature representations of
# a real and a fake audio clip after preprocessing and feature extraction.
# 
# It extracts the 10 features using librosa:
# - Mel Spectrogram, MFCC, Chroma, Tonnetz, Spectral Contrast,
#   Spectral Centroid, Pitch (YIN), Energy (RMS), ZCR, Onset Strength
#
# For each feature, it shows side-by-side plots:
#   Left: Real audio
#   Right: Fake audio.
# ================================================================

class AudioFeatureComparator:
    """Extract the same ten librosa features from two clips and plot them in pairs.

    The features of both clips are computed once in the constructor; the
    plotting methods then show the first clip ("Real") on the left and the
    second clip ("Fake") on the right.
    """

    # Only these features are converted to decibels, so only their colorbars
    # may carry a "dB" unit; every other 2-D feature gets a plain numeric scale.
    _DB_FEATURES = frozenset({"mel_spectrogram"})

    def __init__(self, y_real, y_fake, sr):
        """Store the sample rate and extract the features of both clips.

        Args:
            y_real: samples of the clip plotted as "Real".
            y_fake: samples of the clip plotted as "Fake".
            sr: sample rate shared by both clips.
        """
        self.sr = sr

        # Extract features for both real and fake audio
        self.features_real = self._extract_features(y_real)
        self.features_fake = self._extract_features(y_fake)

    def _extract_features(self, y):
        """Return a dict mapping feature name to the librosa feature array of ``y``."""
        return {
            # The mel spectrogram is the only feature converted to dB
            "mel_spectrogram": librosa.power_to_db(
                librosa.feature.melspectrogram(y=y, sr=self.sr, n_mels=128), ref=np.max
            ),
            "mfcc": librosa.feature.mfcc(y=y, sr=self.sr, n_mfcc=20),
            "chroma": librosa.feature.chroma_stft(y=y, sr=self.sr),
            # Tonnetz is computed on the harmonic component only
            "tonnetz": librosa.feature.tonnetz(y=librosa.effects.harmonic(y), sr=self.sr),
            "spectral_contrast": librosa.feature.spectral_contrast(y=y, sr=self.sr),
            "spectral_centroid": librosa.feature.spectral_centroid(y=y, sr=self.sr),
            "pitch": librosa.yin(y, fmin=50, fmax=300, sr=self.sr),
            "energy": librosa.feature.rms(y=y),
            "zcr": librosa.feature.zero_crossing_rate(y),
            "onset_strength": librosa.onset.onset_strength(y=y, sr=self.sr),
        }

    def plot_feature(self, key, y_axis='linear'):
        """Plot one feature side by side (real on the left, fake on the right).

        Args:
            key: feature name, one of the keys produced by ``_extract_features``.
            y_axis: y-axis type passed to ``librosa.display.specshow`` for 2-D features.

        Raises:
            KeyError: if ``key`` is not an extracted feature.
        """
        feat_real = self.features_real[key]
        feat_fake = self.features_fake[key]

        # A "dB" suffix is only correct for features that are actually in decibels
        colorbar_format = "%+2.0f dB" if key in self._DB_FEATURES else "%+.2f"

        fig, axes = plt.subplots(1, 2, figsize=(14, 3))
        for ax, feat, title in zip(
            axes, [feat_real, feat_fake], ["Real", "Fake"]
        ):
            if feat.ndim == 1 or feat.shape[0] == 1:
                # Single-row features (pitch / energy / ZCR ...) are drawn as a line
                ax.plot(feat.T)
                ax.set_xlabel("Frames")
            else:
                # Multi-row features (mel / MFCC ...) are drawn as an image
                img = librosa.display.specshow(
                    feat, sr=self.sr, x_axis='time', y_axis=y_axis, ax=ax
                )
                fig.colorbar(img, ax=ax, format=colorbar_format)
            ax.set_title(f"{title} - {key}")
        plt.tight_layout()
        plt.show()

    def plot_all(self):
        """Plot every extracted feature, choosing a suitable y-axis type for each."""
        axis_map = {
            "mel_spectrogram": "mel",
            "mfcc": "linear",
            "chroma": "chroma",
            "tonnetz": "tonnetz",
            "spectral_contrast": "linear",
            "spectral_centroid": "linear",
            "pitch": "linear",
            "energy": "linear",
            "zcr": "linear",
            "onset_strength": "linear"
        }
        for key in self.features_real:
            self.plot_feature(key, y_axis=axis_map.get(key, 'linear'))

# Preprocess the two example files (pre-emphasis and noise reduction enabled,
# default RMS normalisation and 6-second target length)
y_real = preprocess_audio(load_audio("datasets/release_in_the_wild/72.wav"), sr=22050, apply_preemphasis=True, apply_reduction=True)
y_fake = preprocess_audio(load_audio("datasets/release_in_the_wild/2.wav"), sr=22050, apply_preemphasis=True, apply_reduction=True)

# Create the visual comparator (features are extracted in the constructor)
# and plot every feature pair
comparator = AudioFeatureComparator(y_real, y_fake, sr=22050)
comparator.plot_all()


Feature Extraction Function (With saving feature to save time for future training/validation)

In [None]:
# ================================================================
# This function loads all `.wav` audio files from the specified 
# input folder structure (train/val/test) and:
# - Applies preprocessing (normalisation, padding, etc.)
# - Extracts the 10 audio features using librosa
# - Saves the features as `.npy` files under:
#     output_root/preprocessed_<split>/<feature_name>/<filename>.npy
#
# This allows the model to train on consistent, aligned features
# across all sets, ready to be passed into the AudioFeatureDataset class.
# ================================================================

def extract_and_save_all(input_root, output_root, sr=22050, target_duration=6.0, apply_preemphasis=False, coef=0.5, normalise='rms', apply_reduction=True):
    """Preprocess every .wav of each split and save ten features per file as .npy.

    For each split in train/val/test, files matching ``input_root/<split>/*.wav``
    are loaded, passed through ``preprocess_audio`` and written to
    ``output_root/preprocessed_<split>/<feature_name>/<stem>.npy`` as float32.

    Args:
        input_root: folder containing the ``train``, ``val`` and ``test`` sub-folders.
        output_root: folder under which the ``preprocessed_<split>`` trees are created.
        sr: sample rate used for loading and feature extraction.
        target_duration: fixed clip length in seconds.
        apply_preemphasis: enable pre-emphasis in ``preprocess_audio``.
        coef: pre-emphasis coefficient.
        normalise: normalisation mode forwarded to ``preprocess_audio``.
        apply_reduction: enable noise reduction in ``preprocess_audio``.
            Defaults to True because the previous positional call passed
            ``coef`` (0.5, truthy) into this slot, so features extracted so far
            were noise-reduced; pass False to skip the (slow) reduction step.

    A failure on one file is printed and the remaining files are still processed.
    """
    input_root = Path(input_root)
    output_root = Path(output_root)

    for split in ["train", "val", "test"]:
        input_folder = input_root / split
        output_base = output_root / f"preprocessed_{split}"

        print(f"Looking in: {input_folder}")
        # Sorted so the processing order is the same on every run
        wav_files = sorted(input_folder.glob("*.wav"))
        print(f"Found {len(wav_files)} files in '{split}'")

        for wav_file in tqdm(wav_files):
            try:
                # Load audio
                y, _ = librosa.load(wav_file, sr=sr)

                # Apply standard preprocessing. Keyword arguments are required:
                # preprocess_audio has ``apply_reduction`` between
                # ``apply_preemphasis`` and ``coef``, so a positional call shifts
                # ``coef`` and ``normalise`` into the wrong parameters.
                y = preprocess_audio(
                    y,
                    sr,
                    target_duration=target_duration,
                    apply_preemphasis=apply_preemphasis,
                    apply_reduction=apply_reduction,
                    coef=coef,
                    normalise=normalise,
                )

                base_name = wav_file.stem + ".npy"
                # Same librosa settings as the feature visualisation, except that
                # the mel spectrogram is saved as raw power (no dB conversion).
                feature_dict = {
                    "mel_spectrogram": librosa.feature.melspectrogram(y=y, sr=sr, n_mels=128),
                    "mfcc": librosa.feature.mfcc(y=y, sr=sr, n_mfcc=20),
                    "chroma": librosa.feature.chroma_stft(y=y, sr=sr),
                    "tonnetz": librosa.feature.tonnetz(y=librosa.effects.harmonic(y), sr=sr),
                    "spectral_contrast": librosa.feature.spectral_contrast(y=y, sr=sr),
                    "spectral_centroid": librosa.feature.spectral_centroid(y=y, sr=sr),
                    "pitch": librosa.yin(y, fmin=50, fmax=300, sr=sr),
                    "energy": librosa.feature.rms(y=y),
                    "zcr": librosa.feature.zero_crossing_rate(y),
                    "onset_strength": librosa.onset.onset_strength(y=y, sr=sr)
                }
                # Save each feature to its own folder under the split's output tree
                for feature_name, data in feature_dict.items():
                    out_path = output_base / feature_name / base_name
                    out_path.parent.mkdir(parents=True, exist_ok=True)
                    np.save(out_path, data.astype(np.float32))

            except Exception as e:
                # Best effort: report the file and carry on with the rest
                print(f"[ERROR] {wav_file.name}: {e}")

# Uncomment the line below to preprocess the audio and extract the 10 feature types (for train/val/test) if not already done

#extract_and_save_all("datasets/release_in_the_wild", "datasets/release_in_the_wild")

In [None]:

# ===============================================================
# Custom Dataset class for deepfake audio classification.
# 
# - Loads file paths and labels from a meta CSV (bona-fide / spoof).
# - Loads 10 pre-extracted features (e.g. MFCC, pitch, ZCR, etc.)
#   stored as `.npy` files.
# - Each feature is padded or cropped to a uniform shape for batching.
# - Returns a tuple: (features..., label) for training or testing.
# ===============================================================


import torch.nn.functional as F
class AudioFeatureDataset(Dataset):
    """Dataset of pre-extracted audio features stored as ``.npy`` files.

    Each item is a tuple ``(feature_1, ..., feature_n, label)`` where the
    features follow the order of ``self.features``, every feature tensor is
    cropped/zero-padded to ``target_shape`` and the label is a float tensor
    (1.0 for 'bona-fide', 0.0 for 'spoof').

    NOTE(review): the default feature order is alphabetical, while
    AudioDeepfakeFusionModel.forward takes its inputs positionally as
    (mfcc, chroma, tonnetz, contrast, pitch, energy, zcr, onset, centroid,
    mel_spec). With ``model(*inputs)`` the branch named "mfcc" therefore
    receives chroma, and so on. The order is left unchanged here because
    reordering would no longer match weights trained with it - confirm intent.
    """

    # Immutable default so instances never share (and mutate) one list object
    DEFAULT_FEATURES = (
        'chroma', 'energy', 'mel_spectrogram', 'mfcc',
        'onset_strength', 'pitch', 'spectral_centroid',
        'spectral_contrast', 'tonnetz', 'zcr',
    )

    def __init__(self, meta_csv, feature_root,
                 features=None,
                 target_shape=(128, 259)):
        """Load the metadata and remember where the feature files live.

        Args:
            meta_csv: CSV with at least the columns "file" and "label".
            feature_root: folder containing one sub-folder per feature name.
            features: feature names to load, in output order; ``None`` selects
                ``DEFAULT_FEATURES``.
            target_shape: (rows, columns) every feature tensor is forced to.
        """
        # Load metadata from CSV
        self.df = pd.read_csv(meta_csv)
        self.df["label"] = self.df["label"].str.strip().str.lower()
        self.feature_root = feature_root
        self.features = list(self.DEFAULT_FEATURES if features is None else features)
        self.label_map = {'bona-fide': 1, 'spoof': 0}
        self.target_shape = target_shape

    def __len__(self):
        """Return the number of rows in the metadata."""
        return len(self.df)

    def pad_or_resize(self, tensor, target_shape):
        """Return a 2-D ``tensor`` cropped and/or zero-padded to ``target_shape``.

        Each dimension is handled independently: it is cut when too large and
        padded with zeros at the end when too small. (Previously a tensor that
        was too large in one dimension and too small in the other was only
        cropped, which produced a shape different from ``target_shape``.)
        """
        target_h, target_w = target_shape

        # Crop first; slicing is a no-op for dimensions that are already small enough
        tensor = tensor[:target_h, :target_w]

        h, w = tensor.shape
        pad_h = target_h - h
        pad_w = target_w - w
        if pad_h or pad_w:
            # F.pad takes (left, right, top, bottom) for the last two dimensions
            tensor = F.pad(tensor, (0, pad_w, 0, pad_h))
        return tensor

    def __getitem__(self, idx):
        """Load all features of row ``idx`` and return ``(*features, label)``.

        Raises:
            ValueError: if the row's label is not in ``label_map``.
            FileNotFoundError: if a feature file for this row does not exist.
        """
        row = self.df.iloc[idx]
        # Feature files are named after the audio file, with a .npy extension
        file_id = os.path.splitext(row["file"])[0] + ".npy"
        label_raw = row["label"]
        if label_raw not in self.label_map:
            raise ValueError(f"Unknown label: '{label_raw}' at idx {idx}")
        label = self.label_map[label_raw]

        feature_arrays = []
        for feat in self.features:
            path = os.path.join(self.feature_root, feat, file_id)
            if not os.path.exists(path):
                raise FileNotFoundError(f"Missing file: {path}")
            feature = np.load(path)
            tensor = torch.tensor(feature, dtype=torch.float32)
            # 1-D features get a leading row axis so every feature is 2-D
            if tensor.dim() == 1:
                tensor = tensor.unsqueeze(0)
            tensor = self.pad_or_resize(tensor, self.target_shape)
            feature_arrays.append(tensor)

        return (*feature_arrays, torch.tensor(label, dtype=torch.float32))






Model Architecture (DenseNN + Siamese)

In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F


# ===============================================================
#
# Architecture for the binary classification hybrid model
# - Processes 10 audio features (MFCC, pitch, energy, etc.)
# - MFCC handled by a CNN (Siamese-style)
# - All other features pass through feedforward dense branches
# - Final output: sigmoid score (0 = fake, 1 = real)
# ===============================================================

# Used for the all features  but mfcc
class DenseNeuralNetwork(nn.Module):
    """Three-layer MLP branch: Linear -> BatchNorm -> ReLU, three times.

    Dropout (p=0.3) follows the first two stages only. The input is expected
    as (batch, input_dim) and the output is (batch, output_dim).
    """

    def __init__(self, input_dim, output_dim=128):
        super().__init__()
        width = 256

        # Stage 1: input -> hidden width
        self.fc1 = nn.Linear(input_dim, width)
        self.bn1 = nn.BatchNorm1d(width)

        # Stage 2: hidden -> hidden
        self.fc2 = nn.Linear(width, width)
        self.bn2 = nn.BatchNorm1d(width)

        # Stage 3: hidden -> output_dim
        self.fc3 = nn.Linear(width, output_dim)
        self.bn3 = nn.BatchNorm1d(output_dim)

        # Shared regularisation / activation modules
        self.dropout = nn.Dropout(0.3)
        self.relu = nn.ReLU()

    def forward(self, x):
        """Run the three stages; no dropout is applied after the last one."""
        hidden = self.dropout(self.relu(self.bn1(self.fc1(x))))
        hidden = self.dropout(self.relu(self.bn2(self.fc2(hidden))))
        return self.relu(self.bn3(self.fc3(hidden)))
    



class SiameseMFCCBranch(nn.Module):
    """Small CNN that maps a (batch, 1, H, W) feature map to a 128-dim vector.

    Three 3x3 convolutions (1 -> 32 -> 64 -> 128 channels) with 2x2 max
    pooling after the second and third, followed by dropout and one linear
    layer. The linear layer is sized for a 128 x 259 input, which the two
    pooling steps reduce to 32 x 64 (so 128 * 32 * 64 flattened values).
    """

    def __init__(self):
        super().__init__()
        # Convolution stack; padding=1 keeps the spatial size for 3x3 kernels
        self.conv1 = nn.Conv2d(1, 32, kernel_size=3, padding=1)
        self.conv2 = nn.Conv2d(32, 64, kernel_size=3, padding=1)
        self.conv3 = nn.Conv2d(64, 128, kernel_size=3, padding=1)

        # Max pooling and dropout
        self.pool = nn.MaxPool2d(2)
        self.dropout = nn.Dropout(0.3)

        # Number of values left after the conv + pooling stack
        self.flattened_size = 128 * 32 * 64
        # Projection to the 128-dim branch output
        self.fc = nn.Linear(self.flattened_size, 128)
        # Defined for use after the FC layer; forward() does not apply it
        self.fc_dropout = nn.Dropout(0.3)

    def forward(self, x):
        """Return the 128-dim embedding of ``x``."""
        feats = F.relu(self.conv1(x))
        # conv2 and conv3 are each followed by ReLU and 2x2 pooling
        for conv in (self.conv2, self.conv3):
            feats = self.pool(F.relu(conv(feats)))
        feats = self.dropout(feats)

        # Flatten everything except the batch axis before the linear layer
        return self.fc(feats.view(feats.size(0), -1))


# Final Fusion Model
class AudioDeepfakeFusionModel(nn.Module):
    """Ten-branch fusion classifier producing one sigmoid score per sample.

    The first input goes through the CNN branch; the other nine are averaged
    over their last axis and passed through separate MLP branches. The ten
    128-dim branch outputs are concatenated, reduced to 256 dimensions by the
    fusion layers and mapped to a single probability.

    NOTE(review): inputs are matched to branches purely by position. The
    training loop calls ``model(*inputs)`` with the dataset's feature order,
    whose default is alphabetical (chroma, energy, mel_spectrogram, mfcc, ...),
    so the argument named ``mfcc`` does not necessarily carry MFCCs - confirm.
    """

    def __init__(self):
        super().__init__()

        # CNN-based branch for the first positional input
        self.mfcc_branch = SiameseMFCCBranch()

        # One MLP branch per remaining input; each receives a 128-dim vector
        for prefix in ("chroma", "tonnetz", "contrast", "pitch", "energy",
                       "zcr", "onset", "centroid", "mel_spec"):
            setattr(self, f"{prefix}_branch", DenseNeuralNetwork(input_dim=128))

        # Fusion layers: 10 x 128 concatenated values -> 512 -> 256
        self.fusion_layer = nn.Sequential(
            nn.Linear(10 * 128, 512),
            nn.ReLU(),
            nn.Dropout(0.3),  # regularisation against overfitting
            nn.Linear(512, 256),
            nn.ReLU(),
            nn.Dropout(0.2),
        )

        # Final classifier layer (single logit)
        self.output_layer = nn.Linear(256, 1)

    def forward(self, mfcc, chroma, tonnetz, contrast, pitch, energy, zcr, onset, centroid, mel_spec):
        """Return a (batch, 1) tensor of scores in [0, 1]."""
        # The CNN needs an explicit channel axis: (B, H, W) -> (B, 1, H, W)
        embeddings = [self.mfcc_branch(mfcc.unsqueeze(1))]

        # Every other input is averaged over its last (time) axis and then
        # sent through its own MLP branch
        dense_pairs = (
            (self.chroma_branch, chroma),
            (self.tonnetz_branch, tonnetz),
            (self.contrast_branch, contrast),
            (self.pitch_branch, pitch),
            (self.energy_branch, energy),
            (self.zcr_branch, zcr),
            (self.onset_branch, onset),
            (self.centroid_branch, centroid),
            (self.mel_spec_branch, mel_spec),
        )
        for branch, feature in dense_pairs:
            embeddings.append(branch(feature.mean(dim=-1)))

        # Concatenate the ten 128-dim vectors and classify
        fused = self.fusion_layer(torch.cat(embeddings, dim=1))

        # Sigmoid is applied here, so callers receive probabilities, not logits
        return torch.sigmoid(self.output_layer(fused))


Training Loop

In [None]:
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import DataLoader
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score, f1_score,
    roc_auc_score, roc_curve, confusion_matrix, ConfusionMatrixDisplay
)
import matplotlib.pyplot as plt
from tqdm import tqdm
from pathlib import Path
import time
# ===============================================================
# this function is used to create metrics for deepfake detection.
# - Calculates accuracy, precision, recall, F1, AUC, and EER.
# - Optionally plots ROC curve and confusion matrix side by side.
# - Can be used during training or evaluation phase.
# ===============================================================


def evaluate_verification_metrics(preds, probs, labels, threshold=0.5, title="ROC Curve", plot=True):
    """Compute, print and optionally plot binary classification metrics.

    Args:
        preds: ignored; predictions are always recomputed as ``probs >= threshold``.
        probs: predicted scores for the positive class (label 1).
        labels: ground-truth labels (assumes 0 = spoof, 1 = bona-fide, as used
            for the confusion-matrix tick labels).
        threshold: decision threshold applied to ``probs``.
        title: prefix of the printed summary line.
        plot: when truthy, show the ROC curve and confusion matrix side by side.

    Returns:
        dict with the keys 'accuracy', 'precision', 'recall', 'f1_score',
        'auc' and 'eer'. The EER is taken as the false-positive rate at the
        ROC point where |FNR - FPR| is smallest.
    """
    labels = np.array(labels)
    probs = np.array(probs)

    # Binary decisions derived from the scores
    preds = (probs >= threshold).astype(int)

    # Threshold-dependent metrics
    acc = accuracy_score(labels, preds)
    prec = precision_score(labels, preds, zero_division=0)
    rec = recall_score(labels, preds, zero_division=0)
    f1 = f1_score(labels, preds, zero_division=0)

    # Threshold-free metrics from the ROC curve
    auc = roc_auc_score(labels, probs)
    fpr, tpr, _ = roc_curve(labels, probs)
    fnr = 1 - tpr
    eer = fpr[np.nanargmin(np.abs(fnr - fpr))]

    print(
        f"{title} | "
        f"Accuracy: {acc:.4f} | Precision: {prec:.4f} | Recall: {rec:.4f} | "
        f"F1: {f1:.4f} | AUC: {auc:.4f} | EER: {eer:.4f}"
    )

    if plot:
        _, (roc_ax, cm_ax) = plt.subplots(1, 2, figsize=(12, 4))

        # Left panel: ROC curve with the chance diagonal
        roc_ax.plot(fpr, tpr, label=f"AUC = {auc:.2f}")
        roc_ax.plot([0, 1], [0, 1], linestyle='--', color='grey')
        roc_ax.set_xlabel("False Positive Rate")
        roc_ax.set_ylabel("True Positive Rate")
        roc_ax.set_title("ROC Curve")
        roc_ax.legend()
        roc_ax.grid(True)

        # Right panel: confusion matrix
        matrix = confusion_matrix(labels, preds)
        display = ConfusionMatrixDisplay(matrix, display_labels=["Spoof", "Bona-fide"])
        display.plot(cmap=plt.cm.Blues, ax=cm_ax, colorbar=False)
        cm_ax.set_title("Confusion Matrix")

        plt.tight_layout()
        plt.show()

    # Metrics as a dictionary for logging
    return {
        'accuracy': acc, 'precision': prec, 'recall': rec,
        'f1_score': f1, 'auc': auc, 'eer': eer
    }



In [None]:
# ===============================================================
# Train loop to train the binary classifer
#  - Uses BCELoss for binary output (sigmoid).
# - Tracks loss and accuracy per epoch.
# - Evaluates performance with precision, recall, F1, AUC, and EER.
# - Optionally saves model weights and plots training history.
# - Note: this is not the best version of this code; the good weights were
#   recorded in pth_models, hence the difference in results here. This run
#   appears to overfit rather than generalise well (TODO confirm intent).
# ===============================================================


def train_binary(
    model,
    dataset,
    epochs=20,
    batch_size=32,
    lr=1e-4,
    device='cuda' if torch.cuda.is_available() else 'cpu',
    save_model_path="model-weights/binary_model.pth"
):
    """Train ``model`` as a binary classifier and save its final weights.

    Uses BCELoss (the model is expected to output probabilities), Adam with
    weight decay 1e-5 and a StepLR schedule that halves the learning rate
    every 10 epochs. After each epoch the training loss, accuracy and further
    metrics are printed; after the last epoch the weights are saved and the
    metrics plus loss/accuracy curves are plotted.

    Args:
        model: network called as ``model(*features)``.
        dataset: dataset yielding ``(*features, label)`` tuples.
        epochs: number of passes over the data.
        batch_size: mini-batch size; incomplete final batches are dropped.
        lr: initial learning rate.
        device: device string; the default is chosen once, when the function
            is defined.
        save_model_path: destination of the saved ``state_dict``.

    Note: all reported metrics, including the "Final Evaluation", are computed
    on the training batches of the respective (last) epoch.
    """
    model = model.to(device)

    # Shuffled batches; drop_last avoids a smaller trailing batch
    dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=True, drop_last=True)

    # Binary cross entropy on probabilities
    criterion = nn.BCELoss()

    # Optimiser and learning-rate schedule
    optimizer = torch.optim.Adam(model.parameters(), lr=lr, weight_decay=1e-5)
    scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=10, gamma=0.5)

    # Make sure the folder for the weights exists
    save_model_path = Path(save_model_path)
    save_model_path.parent.mkdir(parents=True, exist_ok=True)

    # Per-epoch history for the final curves
    epoch_losses = []
    epoch_accuracies = []

    # === Training loop ===
    for epoch in range(1, epochs + 1):
        started = time.time()

        model.train()
        running_loss = 0.0
        all_labels, all_preds, all_probs = [], [], []

        # One progress bar per epoch
        progress = tqdm(dataloader, desc=f"Epoch {epoch}/{epochs}")
        for *inputs, labels in progress:
            # Everything but the last element of a batch is a feature tensor
            inputs = [tensor.to(device) for tensor in inputs]
            labels = labels.float().to(device).unsqueeze(1)

            # Forward pass
            outputs = model(*inputs)
            loss = criterion(outputs, labels)

            # Backward pass and parameter update
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            batch_loss = loss.item()
            running_loss += batch_loss

            # Collect scores, thresholded decisions and targets for the metrics
            probs = outputs.detach().cpu().numpy().flatten()
            all_probs.extend(probs)
            all_preds.extend((probs >= 0.5).astype(int))
            all_labels.extend(labels.detach().cpu().numpy().flatten())

            # Live loss in the progress bar
            progress.set_postfix(loss=batch_loss)

        # End-of-epoch bookkeeping
        avg_loss = running_loss / len(dataloader)
        epoch_losses.append(avg_loss)

        acc = accuracy_score(all_labels, all_preds)
        epoch_accuracies.append(acc)

        duration = time.time() - started
        print(f"\n[Epoch {epoch}] Duration: {duration:.2f}s | Loss: {avg_loss:.4f} | Accuracy: {acc:.4f}")

        # Metrics only; the plots are produced once, after training
        evaluate_verification_metrics(all_preds, all_probs, all_labels, threshold=0.5, plot=False)

        # Advance the schedule and report the resulting learning rate(s)
        scheduler.step()
        for group in optimizer.param_groups:
            print(f"Learning Rate: {group['lr']:.6f}")

    # Save final model weights
    torch.save(model.state_dict(), save_model_path)
    print(f"\nModel saved to '{save_model_path}'")

    # Final evaluation (last epoch's training predictions) with plots
    evaluate_verification_metrics(all_preds, all_probs, all_labels, threshold=0.5, title="Final Evaluation", plot=True)

    # Loss and accuracy curves side by side
    epoch_axis = range(1, epochs + 1)
    curves = (
        ("Loss", epoch_losses, {}),
        ("Accuracy", epoch_accuracies, {"color": "green"}),
    )
    plt.figure(figsize=(10, 4))
    for position, (name, values, extra) in enumerate(curves, start=1):
        plt.subplot(1, 2, position)
        plt.plot(epoch_axis, values, marker='o', **extra)
        plt.xlabel("Epoch")
        plt.ylabel(name)
        plt.title(f"{name} Over Epochs")
        plt.grid(True)

    plt.tight_layout()
    plt.show()


In [None]:

# run the train binary classification model

"""
train_binary(
    model=AudioDeepfakeFusionModel(),
    dataset=AudioFeatureDataset(
        "datasets/release_in_the_wild/train_meta.csv",  
        "datasets/release_in_the_wild/preprocessed_train"  
    ),
    epochs=50,
    batch_size=32,
    lr=0.0005,
    device='cuda',
    save_model_path="model-weights/df_model.pth"
)

"""

Run Train + Val 

In [None]:
# ===============================================================
# Evaluates a trained binary deepfake detection model on a dataset.
# - Computes: Accuracy, Precision, Recall, F1 Score, AUC, and EER
# - Displays a Confusion Matrix
# - Takes: a model and a dataset, and runs inference without gradients
# ===============================================================

def evaluate_on_test_set(model, test_dataset, batch_size=32, device='cuda' if torch.cuda.is_available() else 'cpu'):
    """Run inference over ``test_dataset`` and report binary-classification metrics.

    Args:
        model: Module called as ``model(*inputs)``; a sigmoid is applied to its output.
        test_dataset: Dataset whose items are tuples ``(feature_1, ..., feature_k, label)``.
        batch_size: Number of samples per inference batch.
        device: Device the model and the feature tensors are moved to.

    Returns:
        None. Metrics are printed and a confusion matrix is plotted.

    Raises:
        ValueError: If ``test_dataset`` yields no batches.
    """
    test_loader = DataLoader(test_dataset, batch_size=batch_size)

    # Evaluation mode, on the requested device
    model.eval()
    model.to(device)

    # One array per batch; concatenated after the loop
    prob_batches = []
    label_batches = []

    # No gradients needed for evaluation
    with torch.no_grad():
        for batch in tqdm(test_loader, desc="Testing"):
            # Everything but the last element of the batch is a model input
            inputs = [b.to(device) for b in batch[:-1]]
            outputs = model(*inputs)

            # reshape(-1) instead of squeeze(): squeeze() collapses a batch of
            # one sample to a 0-d array, which cannot be iterated/extended and
            # crashes whenever the final batch holds a single item.
            prob_batches.append(torch.sigmoid(outputs).reshape(-1).cpu().numpy())
            label_batches.append(batch[-1].reshape(-1).cpu().numpy())

    if not prob_batches:
        raise ValueError("test_dataset is empty; nothing to evaluate")

    all_probs = np.concatenate(prob_batches)
    all_labels = np.concatenate(label_batches).astype(int)
    all_preds = (all_probs > 0.5).astype(int)

    # Compute metrics
    acc  = accuracy_score(all_labels, all_preds)
    prec = precision_score(all_labels, all_preds, zero_division=0)
    rec  = recall_score(all_labels, all_preds, zero_division=0)
    f1   = f1_score(all_labels, all_preds, zero_division=0)
    auc  = roc_auc_score(all_labels, all_probs)

    # Equal Error Rate (EER): the point on the ROC curve where the
    # false-positive rate x satisfies x == 1 - TPR(x).
    fpr, tpr, _ = roc_curve(all_labels, all_probs)
    roc_interp = interp1d(fpr, tpr)
    eer = brentq(lambda x: 1. - x - roc_interp(x), 0., 1.)

    # Print results
    print("\nTest Set Evaluation:")
    print(f"Accuracy:  {acc:.4f}")
    print(f"Precision: {prec:.4f}")
    print(f"Recall:    {rec:.4f}")
    print(f"F1 Score:  {f1:.4f}")
    print(f"AUC:       {auc:.4f}")
    print(f"EER:       {eer:.4f}")

    # Show confusion matrix; labels=[0, 1] pins the row/column order so that
    # index 0 ("Spoof") and index 1 ("Bona-fide") always match display_labels.
    cm = confusion_matrix(all_labels, all_preds, labels=[0, 1])
    disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=["Spoof", "Bona-fide"])
    disp.plot(cmap=plt.cm.Blues)
    plt.title("Confusion Matrix - Test Set")
    plt.show()




# Create model instance
model = AudioDeepfakeFusionModel()

# Load model weights. map_location="cpu" lets a checkpoint saved from a GPU
# run load on CPU-only machines as well; evaluate_on_test_set moves the model
# to the evaluation device afterwards.
model.load_state_dict(torch.load("model-weights/df_model.pth", map_location="cpu"))

# Load validation dataset (the variable name is reused for the test split below)
release_in_wild_test_dataset = AudioFeatureDataset(
    meta_csv="datasets/release_in_the_wild/val_meta.csv",
    feature_root="datasets/release_in_the_wild/preprocessed_val",
    features=[
        'mfcc', 'chroma', 'tonnetz', 'spectral_contrast',
        'pitch', 'energy', 'zcr', 'onset_strength', 'spectral_centroid', 'mel_spectrogram'
    ]
)

# Run evaluation on the validation split
evaluate_on_test_set(model, release_in_wild_test_dataset)


# Load test dataset
release_in_wild_test_dataset = AudioFeatureDataset(
    meta_csv="datasets/release_in_the_wild/test_meta.csv",
    feature_root="datasets/release_in_the_wild/preprocessed_test",
    features=[
        'mfcc', 'chroma', 'tonnetz', 'spectral_contrast',
        'pitch', 'energy', 'zcr', 'onset_strength', 'spectral_centroid', 'mel_spectrogram'
    ]
)

# Run evaluation on the test split
evaluate_on_test_set(model, release_in_wild_test_dataset)






EVALUATING ON NEW DATASETS 

Evaluation of model
For 2-sec dataset

In [None]:
# ================================================================
# This function loads all `.wav` audio files from the specified
# input folder structure (<split>/real and <split>/fake) and:
# - Applies preprocessing
# - Extracts the 10 audio features using librosa
# - Saves the features as `.npy` files under:
#     output_root/preprocessed_<split>/<real|fake>/<feature_name>/<filename>.npy
#
# The saved features follow the same layout for every split, ready to be
# passed into the AudioFeatureDataset.
# ================================================================

def extract_and_save_all(input_root, output_root, sr=22050, target_duration=6.0,
                         apply_preemphasis=False, coef=0.5, normalise='rms',
                         splits=("testing",)):
    """Extract and save the 10 librosa features for every .wav file of each split.

    Args:
        input_root: Folder containing ``<split>/real`` and ``<split>/fake``.
        output_root: Folder under which ``preprocessed_<split>/`` is created.
        sr: Sample rate the audio is loaded at (librosa resamples on load).
        target_duration, apply_preemphasis, coef, normalise: Forwarded
            unchanged to ``preprocess_audio`` (defined elsewhere in this notebook).
        splits: Names of the split folders to process. Defaults to the single
            ``"testing"`` split that was previously hard-coded.

    Returns:
        None. Files that fail are reported with an ``[ERROR]`` line and skipped.
    """
    input_root = Path(input_root)
    output_root = Path(output_root)

    for split in splits:
        for label_dir in ("real", "fake"):
            input_folder = input_root / split / label_dir
            output_base = output_root / f"preprocessed_{split}" / label_dir

            print(f"Looking in: {input_folder}")
            wav_files = sorted(input_folder.glob("*.wav"))
            print(f"Found {len(wav_files)} files in '{split}/{label_dir}'")

            for wav_file in tqdm(wav_files, desc=f"Processing {label_dir}"):
                # Best-effort batch job: a broken file is logged and skipped so
                # that one bad recording does not abort the whole extraction.
                try:
                    y, _ = librosa.load(wav_file, sr=sr)
                    y = preprocess_audio(y, sr, target_duration, apply_preemphasis, coef, normalise)

                    # Keep only the text before the first dot, so the name
                    # matches the "<base>.npy" entries of the metadata CSV.
                    clean_name = wav_file.stem.split(".")[0] + ".npy"

                    feature_dict = {
                        "mel_spectrogram": librosa.feature.melspectrogram(y=y, sr=sr, n_mels=128),
                        "mfcc": librosa.feature.mfcc(y=y, sr=sr, n_mfcc=20),
                        "chroma": librosa.feature.chroma_stft(y=y, sr=sr),
                        "tonnetz": librosa.feature.tonnetz(y=librosa.effects.harmonic(y), sr=sr),
                        "spectral_contrast": librosa.feature.spectral_contrast(y=y, sr=sr),
                        "spectral_centroid": librosa.feature.spectral_centroid(y=y, sr=sr),
                        "pitch": librosa.yin(y, fmin=50, fmax=300, sr=sr),
                        "energy": librosa.feature.rms(y=y),
                        "zcr": librosa.feature.zero_crossing_rate(y=y),
                        "onset_strength": librosa.onset.onset_strength(y=y, sr=sr)
                    }

                    for feature_name, data in feature_dict.items():
                        out_path = output_base / feature_name / clean_name
                        out_path.parent.mkdir(parents=True, exist_ok=True)
                        np.save(out_path, data.astype(np.float32))

                except Exception as e:
                    print(f"[ERROR] {wav_file.name}: {e}")

# Disabled one-off run: the call is kept inside a bare string literal so it
# does not execute. Remove the surrounding triple quotes to regenerate the
# features for the for-2seconds testing split.
# NOTE(review): this call passes sr=16000 while the function default is
# sr=22050 - confirm it matches the sample rate used for the training features.
"""
extract_and_save_all(
    input_root="datasets/evaluation/for-2sec/for-2seconds",
    output_root="datasets/evaluation/for-2sec/for-2seconds",
    sr=16000, target_duration=6.0
)
"""



In [None]:
def generate_test_meta_csv(real_dir, fake_dir, save_path):
    """Write a metadata CSV listing the feature files of a real/fake test set.

    Every ``*.wav`` file in ``real_dir`` gets label 1 and every ``*.wav`` file
    in ``fake_dir`` gets label 0. The ``file`` column holds the text before the
    first dot of the file name plus ``.npy``.

    Args:
        real_dir: Folder with the bona-fide ``.wav`` files.
        fake_dir: Folder with the spoofed ``.wav`` files.
        save_path: Destination of the CSV (columns ``file``, ``label``).
    """
    entries = []
    for folder, label in ((real_dir, 1), (fake_dir, 0)):
        for wav_path in sorted(Path(folder).glob("*.wav")):
            base = wav_path.name.split(".")[0]
            entries.append({"file": base + ".npy", "label": label})

    # Explicit columns keep the header row even when no .wav files are found;
    # otherwise the CSV is written without a header and cannot be read back
    # with the expected "file"/"label" columns.
    pd.DataFrame(entries, columns=["file", "label"]).to_csv(save_path, index=False)



# Build the metadata CSV for the for-2seconds "testing" split:
# files under testing/real get label 1, files under testing/fake get label 0.
generate_test_meta_csv(
    real_dir="datasets/evaluation/for-2sec/for-2seconds/testing/real",
    fake_dir="datasets/evaluation/for-2sec/for-2seconds/testing/fake",
    save_path="datasets/evaluation/for-2sec/for-2seconds/test_meta.csv"
)


In [None]:
# ===============================================================
# Custom Dataset class for deepfake audio classification.
#
# - Loads file names and labels from a meta CSV (1 = bona-fide, 0 = spoof).
# - Loads the pre-extracted features (e.g. MFCC, pitch, ZCR, etc.)
#   stored as `.npy` files under feature_root/<real|fake>/<feature>/.
# - Each feature is padded and/or cropped to a uniform shape for batching.
# - Returns a tuple: (features..., label) for training or testing.
# ===============================================================

class AudioFeatureDataset(Dataset):
    """Dataset yielding ``(feature_1, ..., feature_k, label)`` tuples.

    Args:
        meta_csv: CSV with a ``file`` column (``.npy`` file name) and a
            ``label`` column (1 = real, 0 = fake).
        feature_root: Folder containing ``<real|fake>/<feature>/<file>``.
        features: Feature names to load; tensors are returned in this order.
        target_shape: ``(height, width)`` every feature is padded/cropped to.
    """

    def __init__(self, meta_csv, feature_root,
                 features=('chroma', 'energy', 'mel_spectrogram', 'mfcc',
                           'onset_strength', 'pitch', 'spectral_centroid',
                           'spectral_contrast', 'tonnetz', 'zcr'),
                 target_shape=(128, 259)):
        self.df = pd.read_csv(meta_csv)
        self.df["label"] = self.df["label"].astype(int)
        self.feature_root = feature_root
        # Copy into a fresh list so instances never share a default object
        self.features = list(features)
        self.label_map = {1: 1, 0: 0}
        self.target_shape = target_shape

    def __len__(self):
        return len(self.df)

    def _pad_or_resize(self, tensor, target_shape):
        """Return ``tensor`` cropped and zero-padded to exactly ``target_shape``.

        Each axis is handled independently, so a tensor that is too wide but
        too short (e.g. a 1 x 300 pitch track) is cropped in width AND padded
        in height instead of only being cropped.
        """
        h, w = tensor.shape
        target_h, target_w = target_shape

        cropped = tensor[:target_h, :target_w]
        pad_h = max(target_h - h, 0)
        pad_w = max(target_w - w, 0)
        # F.pad order for a 2-D tensor: (left, right, top, bottom)
        return F.pad(cropped, (0, pad_w, 0, pad_h))

    def __getitem__(self, idx):
        row = self.df.iloc[idx]
        label_raw = row["label"]
        if label_raw not in self.label_map:
            raise ValueError(f"Unknown label: '{label_raw}' at idx {idx}")
        label = self.label_map[label_raw]

        # Features are stored in a per-class sub-folder
        label_dir = "real" if label == 1 else "fake"

        file_id = row["file"]
        feature_arrays = []
        for feat in self.features:
            path = os.path.join(self.feature_root, label_dir, feat, file_id)
            if not os.path.exists(path):
                raise FileNotFoundError(f"Missing file: {path}")

            tensor = torch.tensor(np.load(path), dtype=torch.float32)
            # 1-D features become a single-row 2-D tensor: (n,) -> (1, n)
            if tensor.dim() == 1:
                tensor = tensor.unsqueeze(0)
            feature_arrays.append(self._pad_or_resize(tensor, self.target_shape))

        return (*feature_arrays, torch.tensor(label, dtype=torch.float32))

In [None]:
# ===============================================================
# Evaluates a trained binary deepfake detection model on a test dataset.
# - Computes: Accuracy, Precision, Recall, F1 Score, AUC, and EER
# - Reports metrics at threshold 0.5 and at the ROC-optimal threshold
# - Displays a Confusion Matrix and the ROC curve
# - Takes: a model, test dataset, and runs inference using no gradients
# ===============================================================

def evaluate_on_test_set(model, test_dataset,
                         batch_size=32,
                         device='cuda' if torch.cuda.is_available() else 'cpu'):
    """Run inference over ``test_dataset`` and report verification metrics.

    Args:
        model: Module called as ``model(*features)``; a sigmoid is applied to its output.
        test_dataset: Dataset whose items are tuples ``(feature_1, ..., feature_k, label)``.
        batch_size: Number of samples per inference batch.
        device: Device the model and the feature tensors are moved to.

    Returns:
        None. Metrics are printed; a confusion matrix and ROC curve are plotted.

    Raises:
        ValueError: If ``test_dataset`` yields no batches.
    """
    loader = DataLoader(test_dataset, batch_size=batch_size)

    # Evaluation mode, on the requested device
    model.eval()
    model.to(device)

    # One array per batch; concatenated after the loop
    prob_batches, label_batches = [], []

    # Inference without tracking gradients
    with torch.no_grad():
        for batch in tqdm(loader, desc="Testing"):
            # The last element of the batch is the label, the rest are features
            *features, labels = batch
            features = [f.to(device) for f in features]

            outputs = model(*features)

            # reshape(-1) instead of squeeze(): squeeze() collapses a batch of
            # one sample to a 0-d array whose .tolist() is a bare float, which
            # crashes list.extend whenever the final batch holds a single item.
            prob_batches.append(torch.sigmoid(outputs).reshape(-1).cpu().numpy())
            label_batches.append(labels.reshape(-1).cpu().numpy())

    if not prob_batches:
        raise ValueError("test_dataset is empty; nothing to evaluate")

    all_probs = np.concatenate(prob_batches)
    all_labels = np.concatenate(label_batches).astype(int)
    all_preds_05 = (all_probs > 0.5).astype(int)

    # ROC Curve & AUC
    fpr, tpr, thresholds = roc_curve(all_labels, all_probs)
    auc = roc_auc_score(all_labels, all_probs)

    # Equal Error Rate (EER): the false-positive rate x where x == 1 - TPR(x)
    roc_interp = interp1d(fpr, tpr)
    eer = brentq(lambda x: 1. - x - roc_interp(x), 0., 1.)

    # Find optimal threshold (maximizing TPR - FPR).
    # roc_curve counts a sample as positive when score >= threshold, so the
    # same comparison is used here to reproduce that operating point.
    optimal_idx = np.argmax(tpr - fpr)
    optimal_threshold = thresholds[optimal_idx]
    preds_optimal = (all_probs >= optimal_threshold).astype(int)

    # Evaluation at default threshold (0.5)
    print("\n--- Evaluation at Threshold = 0.5 ---")
    print(f"Accuracy: {accuracy_score(all_labels, all_preds_05):.4f}")
    print(f"Precision:{precision_score(all_labels, all_preds_05, zero_division=0):.4f}")
    print(f"Recall:{recall_score(all_labels, all_preds_05, zero_division=0):.4f}")
    print(f"F1 Score:{f1_score(all_labels, all_preds_05, zero_division=0):.4f}")

    # Evaluation at optimal threshold
    print("\n--- Evaluation at Optimal Threshold ---")
    print(f"Optimal Threshold: {optimal_threshold:.4f}")
    print(f"Accuracy:{accuracy_score(all_labels, preds_optimal):.4f}")
    print(f"Precision:{precision_score(all_labels, preds_optimal, zero_division=0):.4f}")
    print(f"Recall:{recall_score(all_labels, preds_optimal, zero_division=0):.4f}")
    print(f"F1 Score:{f1_score(all_labels, preds_optimal, zero_division=0):.4f}")

    # Summary stats
    print(f"\nAUC:       {auc:.4f}")
    print(f"EER:       {eer:.4f}")

    # Confusion Matrix (Optimal Threshold)
    cm = confusion_matrix(all_labels, preds_optimal)
    disp = ConfusionMatrixDisplay(cm, display_labels=["spoof", "bona-fide"])
    disp.plot(cmap=plt.cm.Blues)
    plt.title("Confusion Matrix @ Optimal Threshold")
    plt.tight_layout()
    plt.show()

    # ROC Curve with the EER point marked at (FPR, TPR) = (eer, 1 - eer)
    eer_x = eer
    eer_y = 1 - eer

    plt.figure(figsize=(6, 5))
    plt.plot(fpr, tpr, label=f"AUC = {auc:.4f}")
    plt.plot([0, 1], [0, 1], linestyle='--', color='grey', label="Random Guess")
    plt.plot(eer_x, eer_y, 'ro', label=f"EER = {eer:.4f}")
    plt.xlabel("False Positive Rate")
    plt.ylabel("True Positive Rate")
    plt.title("ROC Curve")
    plt.legend(loc="lower right")
    plt.grid(True)
    plt.tight_layout()
    plt.show()

In [None]:

# ===============================================================
# Run the evaluation on the for-2seconds test dataset
# ===============================================================

# Build the model and load the trained weights; map_location keeps the
# checkpoint tensors on the CPU while loading.
model = AudioDeepfakeFusionModel()
model.load_state_dict(torch.load("model-weights/df_model.pth", map_location=torch.device('cpu')))

# The feature list is given explicitly: the dataset returns the tensors in
# this order and evaluate_on_test_set passes them positionally to the model.
for_2sec_test_dataset = AudioFeatureDataset(
    meta_csv="datasets/evaluation/for-2sec/for-2seconds/test_meta.csv",
    feature_root="datasets/evaluation/for-2sec/for-2seconds/preprocessed_testing",
    features=[
        'mfcc', 'chroma', 'tonnetz', 'spectral_contrast',
        'pitch', 'energy', 'zcr', 'onset_strength', 'spectral_centroid', 'mel_spectrogram'
    ]
)
evaluate_on_test_set(model, for_2sec_test_dataset)


Evaluation of ASVspoof2019 LA dataset

In [None]:
# ===============================================================
# Builds the metadata CSV for the ASVspoof 2019 LA evaluation set.
# - Reads the protocol file line by line (speaker, audio id, ..., key)
# - Skips entries whose .flac file is absent from the audio directory
# - Writes 'file', 'speaker' and binary 'label' (1 = bonafide, 0 = spoof)
# ===============================================================


def make_la_eval_metadata(
    protocol_path: str,
    audio_dir: str,
    output_csv: str
):
    """Convert a 5-column protocol file into a ``file,speaker,label`` CSV.

    Args:
        protocol_path: Whitespace-separated protocol file; lines that do not
            have exactly five columns are ignored.
        audio_dir: Folder expected to contain ``<audio_id>.flac`` files.
        output_csv: Destination path of the generated CSV.

    Raises:
        FileNotFoundError: If ``protocol_path`` does not exist.
    """
    protocol_file = Path(protocol_path)
    if not protocol_file.exists():
        raise FileNotFoundError(f"Cannot find protocol file: {protocol_file}")

    rows = []
    with protocol_file.open('r') as handle:
        for raw_line in handle:
            fields = raw_line.strip().split()

            # Only well-formed, five-column lines are considered
            if len(fields) != 5:
                continue

            speaker, audio_id, key = fields[0], fields[1], fields[4]
            flac_name = audio_id + ".flac"
            flac_path = Path(audio_dir) / flac_name

            if flac_path.exists():
                # "bonafide" (any casing) maps to 1, everything else to 0
                rows.append({
                    "file":    flac_name,
                    "speaker": speaker,
                    "label":   1 if key.lower() == "bonafide" else 0
                })
            else:
                print(f"[WARNING] Audio missing: {flac_path}")

    # Persist the collected rows
    table = pd.DataFrame(rows, columns=["file", "speaker", "label"])
    table.to_csv(output_csv, index=False)
    print(f"Saved {len(table)} rows to {output_csv}")

# Generate the metadata CSV for the ASVspoof2019 LA evaluation partition
# from its CM protocol file; entries whose .flac file is missing are skipped.
make_la_eval_metadata(
    protocol_path="datasets/evaluation/LA/ASVspoof2019_LA_cm_protocols/ASVspoof2019.LA.cm.eval.trl.txt",
    audio_dir="datasets/evaluation/LA/ASVspoof2019_LA_eval/flac",          
    output_csv="datasets/evaluation/LA/LA_eval_meta.csv"
)


In [None]:
# ================================================================
# This function loads every audio file listed in a metadata CSV
# (columns 'file' and 'label') from audio_root and:
# - Applies preprocessing via preprocess_audio
# - Extracts the 10 audio features using librosa
# - Saves the features as `.npy` files under:
#     output_root/preprocessed_eval/<real|fake>/<feature_name>/<filename>.npy
#
# The output is ready to be passed into the AudioFeatureDataset.
# It differs from the version used for release-in-the-wild in that the
# file list comes from the metadata CSV and an ETA is shown while running.
# ================================================================


def extract_and_save_la_features_with_eta(
    meta_csv: str,
    audio_root: str,
    output_root: str,
    sr: int = 16000,
    target_duration: float = 6.0,
    apply_preemphasis: bool = False,
    coef: float = 0.5,
    normalise: str = "rms",
):
    """Extract and save the 10 librosa features for every file in ``meta_csv``.

    Args:
        meta_csv: CSV with a ``file`` column (audio file name) and a ``label``
            column (1 is stored under ``real``, anything else under ``fake``).
        audio_root: Folder containing the audio files named in ``meta_csv``.
        output_root: Folder under which ``preprocessed_eval/`` is created.
        sr: Sample rate the audio is loaded at (librosa resamples on load).
        target_duration, apply_preemphasis, coef, normalise: Forwarded
            unchanged to ``preprocess_audio`` (defined elsewhere in this notebook).

    Returns:
        None. Any error while processing a file propagates to the caller.
    """
    df = pd.read_csv(meta_csv)
    audio_root = Path(audio_root)
    output_base = Path(output_root) / "preprocessed_eval"

    total_files = len(df)
    start_time = time.time()

    # The context manager closes the progress bar even when a file raises,
    # so an aborted run does not leave a dangling bar behind.
    with tqdm(df.iterrows(),
              total=total_files,
              desc="Extract LA features",
              unit="file") as pbar:
        for processed, (_, row) in enumerate(pbar, start=1):
            filename = row["file"]
            label_dir = "real" if row["label"] == 1 else "fake"
            in_path = audio_root / filename

            y, _ = librosa.load(str(in_path), sr=sr)
            y = preprocess_audio(y, sr, target_duration,
                                 apply_preemphasis, coef, normalise)

            feats = {
                "mel_spectrogram":   librosa.feature.melspectrogram(y=y, sr=sr, n_mels=128),
                "mfcc":              librosa.feature.mfcc(y=y, sr=sr, n_mfcc=20),
                "chroma":            librosa.feature.chroma_stft(y=y, sr=sr),
                "tonnetz":           librosa.feature.tonnetz(y=librosa.effects.harmonic(y), sr=sr),
                "spectral_contrast": librosa.feature.spectral_contrast(y=y, sr=sr),
                "spectral_centroid": librosa.feature.spectral_centroid(y=y, sr=sr),
                "pitch":             librosa.yin(y, fmin=50, fmax=300, sr=sr),
                "energy":            librosa.feature.rms(y=y),
                "zcr":               librosa.feature.zero_crossing_rate(y=y),
                "onset_strength":    librosa.onset.onset_strength(y=y, sr=sr)
            }

            # Same base name as the audio file, with a .npy extension
            npy_name = Path(filename).stem + ".npy"
            for feat_name, arr in feats.items():
                out_dir = output_base / label_dir / feat_name
                out_dir.mkdir(parents=True, exist_ok=True)
                np.save(out_dir / npy_name, arr.astype(np.float32))

            # ETA = average time per processed file * files still to do
            elapsed = time.time() - start_time
            avg_per_file = elapsed / processed
            eta = (total_files - processed) * avg_per_file
            pbar.set_postfix_str(f"ETA {eta:.0f}s")

# Disabled one-off run: the call is kept inside a bare string literal so it
# does not execute. Remove the surrounding triple quotes to regenerate the
# LA evaluation features.
# NOTE(review): this call passes sr=22050, overriding the function default of
# sr=16000 - confirm it matches the sample rate used for the training features.
"""
extract_and_save_la_features_with_eta(
    meta_csv="datasets/evaluation/LA/LA_eval_meta.csv",
    audio_root="datasets/evaluation/LA/ASVspoof2019_LA_eval/flac",
    output_root="datasets/evaluation/LA",
    sr=22050,
    target_duration=6.0,
    apply_preemphasis=False,
    coef=0.5,
    normalise="rms",
)
"""


In [None]:
# ===============================================================
# Dataset class for loading preprocessed audio features.
#
# - Loads feature data from .npy files (one per feature per file)
# - Supports 10 audio features (e.g. MFCC, chroma, ZCR, etc.)
# - Pads and/or crops every feature to a uniform shape
# - Returns all features as tensors + binary label (real/fake)
# ===============================================================

class AudioFeatureDataset(Dataset):
    """Dataset yielding ``(feature_1, ..., feature_k, label)`` tuples.

    Args:
        meta_csv: CSV with a ``file`` column (audio file name; its stem plus
            ``.npy`` is the feature file name) and a ``label`` column
            (1 = real, 0 = fake).
        feature_root: Folder containing ``<subdir>/<real|fake>/<feature>/``.
        features: Feature names to load; tensors are returned in this order.
        target_shape: ``(height, width)`` every feature is padded/cropped to;
            a ``None`` entry keeps that axis at its original size.
        subdir: Folder below ``feature_root`` that holds the features.
            Defaults to the previously hard-coded ``"preprocessed_eval"``.
    """

    def __init__(self, meta_csv, feature_root,
                 features=(
                     'mel_spectrogram', 'mfcc', 'chroma', 'tonnetz',
                     'spectral_contrast', 'spectral_centroid',
                     'pitch', 'energy', 'zcr', 'onset_strength'
                 ),
                 target_shape=(128, 259),
                 subdir="preprocessed_eval"):

        # Load the metadata CSV into a dataframe
        self.df = pd.read_csv(meta_csv)

        # Path where preprocessed features are stored
        self.feature_root = Path(feature_root)
        self.subdir = subdir

        # Copy into a fresh list so instances never share a default object
        self.features = list(features)

        # Desired shape for each feature tensor
        self.target_shape = target_shape

        # Ensure label is int (1 = real, 0 = fake)
        self.df["label"] = self.df["label"].astype(int)

    def __len__(self):
        # Total number of samples
        return len(self.df)

    def _pad_or_resize(self, tensor, target_shape):
        """Return ``tensor`` cropped and zero-padded to ``target_shape``.

        Each axis is handled independently, so a tensor that is too wide but
        too short (e.g. a 1 x 300 pitch track) is cropped in width AND padded
        in height instead of only being cropped.
        """
        h, w = tensor.shape
        th, tw = target_shape
        th = h if th is None else th
        tw = w if tw is None else tw

        cropped = tensor[:th, :tw]
        pad_h = max(th - h, 0)
        pad_w = max(tw - w, 0)
        # F.pad order for a 2-D tensor: (left, right, top, bottom)
        return F.pad(cropped, (0, pad_w, 0, pad_h))

    def __getitem__(self, idx):
        # Retrieve the row from metadata
        row = self.df.iloc[idx]

        # File name without extension + .npy
        file_id = Path(row["file"]).stem + ".npy"

        # Features are stored in a per-class sub-folder
        label_dir = "real" if row["label"] == 1 else "fake"

        feature_tensors = []
        for feat in self.features:
            path = self.feature_root / self.subdir / label_dir / feat / file_id
            if not path.exists():
                raise FileNotFoundError(f"Missing {path}")

            t = torch.tensor(np.load(path), dtype=torch.float32)

            # 1-D features become a single-row 2-D tensor: (n,) -> (1, n)
            if t.dim() == 1:
                t = t.unsqueeze(0)

            feature_tensors.append(self._pad_or_resize(t, self.target_shape))

        # Tuple of (features..., label)
        return (*feature_tensors, torch.tensor(row["label"], dtype=torch.float32))



In [None]:
# ===============================================================
# Evaluates a trained binary deepfake detection model on a test dataset.
# - Computes: Accuracy, Precision, Recall, F1 Score, AUC, and EER
# - Reports metrics at threshold 0.5 and at the ROC-optimal threshold
# - Displays a Confusion Matrix and the ROC curve
# - Takes: a model, test dataset, and runs inference using no gradients
# ===============================================================


def evaluate_on_test_set(model, test_dataset,
                         batch_size=32,
                         device='cuda' if torch.cuda.is_available() else 'cpu'):
    """Run inference over ``test_dataset`` and report verification metrics.

    Args:
        model: Module called as ``model(*features)``; a sigmoid is applied to its output.
        test_dataset: Dataset whose items are tuples ``(feature_1, ..., feature_k, label)``.
        batch_size: Number of samples per inference batch.
        device: Device the model and the feature tensors are moved to.

    Returns:
        None. Metrics are printed; a confusion matrix and ROC curve are plotted.

    Raises:
        ValueError: If ``test_dataset`` yields no batches.
    """
    loader = DataLoader(test_dataset, batch_size=batch_size)
    model.eval()
    model.to(device)

    # One array per batch; concatenated after the loop
    prob_batches, label_batches = [], []

    with torch.no_grad():
        for batch in tqdm(loader, desc="Testing"):
            # The last element of the batch is the label, the rest are features
            *features, labels = batch
            features = [f.to(device) for f in features]
            outputs = model(*features)

            # reshape(-1) instead of squeeze(): squeeze() collapses a batch of
            # one sample to a 0-d array whose .tolist() is a bare float, which
            # crashes list.extend whenever the final batch holds a single item.
            prob_batches.append(torch.sigmoid(outputs).reshape(-1).cpu().numpy())
            label_batches.append(labels.reshape(-1).cpu().numpy())

    if not prob_batches:
        raise ValueError("test_dataset is empty; nothing to evaluate")

    all_probs = np.concatenate(prob_batches)
    all_labels = np.concatenate(label_batches).astype(int)
    all_preds_05 = (all_probs > 0.5).astype(int)

    # ROC, AUC and EER (the false-positive rate x where x == 1 - TPR(x))
    fpr, tpr, thresholds = roc_curve(all_labels, all_probs)
    auc = roc_auc_score(all_labels, all_probs)
    roc_interp = interp1d(fpr, tpr)
    eer = brentq(lambda x: 1. - x - roc_interp(x), 0., 1.)

    # Optimal threshold based on max(TPR - FPR).
    # roc_curve counts a sample as positive when score >= threshold, so the
    # same comparison is used here to reproduce that operating point.
    optimal_idx = np.argmax(tpr - fpr)
    optimal_threshold = thresholds[optimal_idx]
    preds_optimal = (all_probs >= optimal_threshold).astype(int)

    print("\n--- Evaluation at Threshold = 0.5 ---")
    print(f"Accuracy:  {accuracy_score(all_labels, all_preds_05):.4f}")
    print(f"Precision: {precision_score(all_labels, all_preds_05, zero_division=0):.4f}")
    print(f"Recall:    {recall_score(all_labels, all_preds_05, zero_division=0):.4f}")
    print(f"F1 Score:  {f1_score(all_labels, all_preds_05, zero_division=0):.4f}")

    print("\n--- Evaluation at Optimal Threshold ---")
    print(f"Optimal Threshold: {optimal_threshold:.4f}")
    print(f"Accuracy:  {accuracy_score(all_labels, preds_optimal):.4f}")
    print(f"Precision: {precision_score(all_labels, preds_optimal, zero_division=0):.4f}")
    print(f"Recall:    {recall_score(all_labels, preds_optimal, zero_division=0):.4f}")
    print(f"F1 Score:  {f1_score(all_labels, preds_optimal, zero_division=0):.4f}")

    print(f"\nAUC:       {auc:.4f}")
    print(f"EER:       {eer:.4f}")

    # Confusion matrix for optimal threshold
    cm = confusion_matrix(all_labels, preds_optimal)
    disp = ConfusionMatrixDisplay(cm, display_labels=["spoof", "bona-fide"])
    disp.plot(cmap=plt.cm.Blues)
    plt.title("Confusion Matrix @ Optimal Threshold")
    plt.tight_layout()
    plt.show()

    # ROC Curve with the EER point marked at (FPR, TPR) = (eer, 1 - eer)
    eer_x = eer
    eer_y = 1 - eer

    plt.figure(figsize=(6, 5))
    plt.plot(fpr, tpr, label=f"AUC = {auc:.4f}")
    plt.plot([0, 1], [0, 1], linestyle='--', color='grey', label="Random Guess")
    plt.plot(eer_x, eer_y, 'ro', label=f"EER = {eer:.4f}")
    plt.xlabel("False Positive Rate")
    plt.ylabel("True Positive Rate")
    plt.title("ROC Curve")
    plt.legend(loc="lower right")
    plt.grid(True)
    plt.tight_layout()
    plt.show()



    


In [None]:
# Run the evaluation loop on the eval partition of the LA dataset

# Build the model and load the trained weights onto the CPU
model = AudioDeepfakeFusionModel()
model.load_state_dict(
    torch.load("model-weights/df_model.pth", map_location="cpu")
)

# The feature list is given explicitly: the dataset returns the tensors in
# this order and evaluate_on_test_set passes them positionally to the model.
dataset = AudioFeatureDataset(
    meta_csv="datasets/evaluation/LA/LA_eval_meta.csv",
    feature_root="datasets/evaluation/LA",
    features=[
        'mfcc', 'chroma', 'tonnetz', 'spectral_contrast',
        'pitch', 'energy', 'zcr', 'onset_strength', 'spectral_centroid', 'mel_spectrogram'
    ]
)



# NOTE(review): debug_loader and batch are not used by the evaluation call
# below; this looks like a leftover sanity check that one batch can be
# loaded - confirm before removing.
debug_loader = DataLoader(dataset, batch_size=32, shuffle=False)
batch = next(iter(debug_loader))

evaluate_on_test_set(model, dataset, batch_size=32)
