In [33]:
import os
import numpy as np
import pandas as pd
from glob import glob
import librosa
import torch
from tqdm import tqdm
from transformers import Wav2Vec2FeatureExtractor, Wav2Vec2Model
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, roc_auc_score, precision_recall_curve, auc, roc_curve
from sklearn.neural_network import MLPClassifier
from sklearn.preprocessing import StandardScaler

# Initialize paths and parameters
model_name = "facebook/wav2vec2-large-xlsr-53"
feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained(model_name)
model = Wav2Vec2Model.from_pretrained(model_name)

train_audio_path = r"F:\\Awais_data\\Datasets\\PartialSpoof\\train\\con_wav\\*.wav"
segment_label_path = r"F:\Awais_data\Datasets\PartialSpoof\database_segment_labels\database\segment_labels\train_seglab_0.16.npy"
utterance_label_path = r"F:\Awais_data\Datasets\PartialSpoof\protocols\PartialSpoof_LA_cm_protocols\PartialSpoof.LA.cm.train.trl.txt"
save_path = r"C:\\Notebooks\\rrl_source\\dataset_raw\\merge\\new\\"

window_size = 0.16 # Segment length in seconds
hop_size = 0.16    # Frame shift in seconds
extract_utterance_level = False  # Set to True for utterance-level, False for segment-level

# Hann window function
def hann_window(length):
    return 0.5 * (1 - np.cos(2 * np.pi * np.arange(length) / (length - 1)))

# Load segment-level or utterance-level labels
def load_labels(label_file, utterance_level):
    labels = {}
    
    if utterance_level:
        # For utterance-level labels, we parse the label file and map the class labels to file_ids
        with open(label_file, 'r') as file:
            for line in file:
                parts = line.strip().split(' ')
                if len(parts) >= 5:  # Assuming the line structure is consistent
                    file_id = parts[1].strip()  # Extract the file_id
                    label = parts[-1].strip()   # Extract the label (e.g., "spoof" or "bonafide")
                    # Assign 0 for spoof, 1 for real/bonafide
                    labels[file_id] = 0 if label == 'spoof' else 1
    else:
        # If working with segment-level labels
        labels = np.load(label_file, allow_pickle=True).item()

    return labels

def extract_handcrafted_features(segment, sr, target_size=60):
    windowed_segment = segment * hann_window(len(segment))
    mfcc = librosa.feature.mfcc(y=windowed_segment, sr=sr, n_mfcc=13).mean(axis=1)
    delta_mfcc = librosa.feature.delta(mfcc)
    tempo = librosa.beat.tempo(y=windowed_segment, sr=sr)[0]
    chroma = librosa.feature.chroma_stft(y=windowed_segment, sr=sr).mean(axis=1)
    zcr = librosa.feature.zero_crossing_rate(windowed_segment).mean()
    energy = librosa.feature.rms(y=windowed_segment).mean()
    pitches, _ = librosa.core.piptrack(y=windowed_segment, sr=sr)
    pitch = np.mean(pitches[pitches > 0]) if len(pitches[pitches > 0]) > 0 else 0
    tempogram = librosa.feature.tempogram(y=windowed_segment, sr=sr).mean(axis=1)
    downsampled_tempogram = tempogram[::int(np.ceil(len(tempogram) / 18))]
    
    features = np.concatenate((mfcc, delta_mfcc, [tempo], chroma, [zcr], [energy], [pitch], downsampled_tempogram))
    
    # Pad or truncate features to the target size
    if len(features) < target_size:
        features = np.pad(features, (0, target_size - len(features)), mode='constant')
    elif len(features) > target_size:
        features = features[:target_size]
    
    return features

# Wav2Vec2 feature extraction function
def extract_wav2vec_features(segment, sr, target_size=1024):
    if len(segment.shape) > 1:   # Ensure the segment is a 1D array and not a 2D array
        segment = segment.flatten()  # Flatten it to 1D if needed
    
    inputs = feature_extractor(segment, sampling_rate=sr, return_tensors="pt", padding=False)
    
    with torch.no_grad():
        outputs = model(**inputs)  # Extract the last hidden state and return it
    wav2vec_features = outputs.last_hidden_state.mean(dim=1).squeeze().numpy()
    
    # Pad or truncate to fixed size (1024)
    if len(wav2vec_features) < target_size:
        wav2vec_features = np.pad(wav2vec_features, (0, target_size - len(wav2vec_features)), mode='constant')
    elif len(wav2vec_features) > target_size:
        wav2vec_features = wav2vec_features[:target_size]
    
    return wav2vec_features

# Main processing function
def process_dataset(audio_files, labels, utterance_level):
    all_features = []
    all_labels = []
    for audio_file in tqdm(audio_files, desc="Processing files"):
        file_id = os.path.basename(audio_file).replace('.wav', '')
        if file_id not in labels:
            print('no file id exist')
            continue

        y, sr = librosa.load(audio_file, sr=16000)
        y = librosa.util.fix_length(y, size=int(4 * sr))  # Truncate or pad to 4 seconds
  # Truncate or pad to 4 seconds
        num_frames = int(4 / hop_size)
        
        if utterance_level:
            label = labels[file_id]
            for _ in range(num_frames):
                handcrafted_features = extract_handcrafted_features(y, sr)
                wav2vec_features = extract_wav2vec_features(y, sr)
                combined_features = np.concatenate((handcrafted_features, wav2vec_features))
                all_features.append(combined_features)
                all_labels.append(label)
        else:
            segment_labels = labels[file_id]
            segment_length = int(window_size * sr)
            hop_length = int(hop_size * sr)
            for i in range(num_frames):
                start = i * hop_length
                end = start + segment_length
                segment = y[start:end]
                handcrafted_features = extract_handcrafted_features(segment, sr)
                wav2vec_features = extract_wav2vec_features(segment, sr)
                combined_features = np.concatenate((handcrafted_features, wav2vec_features))
                all_features.append(combined_features)
                all_labels.append(segment_labels[i])

    return np.array(all_features), np.array(all_labels)

# Load labels and process dataset for 100 spoof and 100 real audios
labels = load_labels(utterance_label_path if extract_utterance_level else segment_label_path, extract_utterance_level)

audio_files = glob(train_audio_path)
print(len(audio_files))

# Extract file IDs (for matching with labels)
audio_file_ids = [os.path.basename(file).replace('.wav', '') for file in audio_files]
# print(audio_file_ids[:10])  # Print first 10 file IDs

# Now, match the files based on file_id and their labels
# spoof_files = [file for file in audio_files if os.path.basename(file).replace('.wav', '') in labels and labels[os.path.basename(file).replace('.wav', '')] == 0]
# print(len(spoof_files))

# real_files = [file for file in audio_files if os.path.basename(file).replace('.wav', '') in labels and labels[os.path.basename(file).replace('.wav', '')] == 1]
# print(len(real_files))




spoof_files = []
real_files = []
for file in audio_files:
    file_id = os.path.basename(file).replace('.wav', '')
    
    if file_id not in labels:
        print(f"File ID {file_id} not found in labels.")
        continue
    
    if extract_utterance_level:
        # For utterance-level labels
        label = labels[file_id]
        if label == 0:
            spoof_files.append(file)
        elif label == 1:
            real_files.append(file)
    else:
        # For segment-level labels
        segment_labels = labels[file_id]
        # Here, check for segment-level processing logic
        if int(segment_labels[0]) == 0:
            spoof_files.append(file)
        elif int(segment_labels[0]) == 1:
            real_files.append(file)

print(f"Number of spoof files: {len(spoof_files)}")
print(f"Number of real files: {len(real_files)}")
# selected_spoof_files = spoof_files  # Use all available spoof files
# selected_real_files = real_files  # Use all available real files

print(f"Length of segment_labels: {len(segment_labels)}")
print(f"Indexing at: {i}")
if i >= len(segment_labels):
    print(f"Skipping index {i} because it's out of bounds.")
else:
    all_labels.append(segment_labels[i])

# Select 100 spoof audios from the start and 100 real audios from the end
selected_spoof_files = spoof_files[:10]  # Selecting the first 100 spoof files
selected_real_files = real_files[-10:]  # Selecting the last 100 real files

# Print names and labels of selected audios
for audio_file in selected_spoof_files:
    file_id = os.path.basename(audio_file).replace('.wav', '')
    # print(f"Selected spoof audio: {file_id} - Label: {labels[file_id]}")

for audio_file in selected_real_files:
    file_id = os.path.basename(audio_file).replace('.wav', '')
    # print(f"Selected real audio: {file_id} - Label: {labels[file_id]}")

# Combine the selected files
selected_audio_files = selected_spoof_files + selected_real_files

# Process the dataset
features, labels = process_dataset(selected_audio_files, labels, extract_utterance_level)

# Split dataset into train, validation, and test sets (80-10-10 split)
X_train, X_temp, y_train, y_temp = train_test_split(features, labels, test_size=0.2, random_state=42)
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.2, random_state=42)

# Standardize the features (important for MLP performance)
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_val = scaler.transform(X_val)
X_test = scaler.transform(X_test)

# Train an MLP classifier
mlp = MLPClassifier(hidden_layer_sizes=(512, 256), max_iter=300, random_state=42)
mlp.fit(X_train, y_train)

# Evaluate the model
y_pred = mlp.predict(X_test)
y_pred_prob = mlp.predict_proba(X_test)[:, 1]  # Probabilities for ROC AUC and EER

# Confusion matrix
cm = confusion_matrix(y_test, y_pred)

# ROC AUC
roc_auc = roc_auc_score(y_test, y_pred_prob)

# EER calculation
precision, recall, thresholds = precision_recall_curve(y_test, y_pred_prob)
fpr, tpr, _ = roc_curve(y_test, y_pred_prob)
eer = fpr[np.nanargmin(np.abs(fpr - (1 - tpr)))]  # EER is where fpr = 1 - tpr

print(f"Confusion Matrix:\n{cm}")
print(f"ROC AUC: {roc_auc:.4f}")
print(f"EER: {eer:.4f}")

# Save evaluation results to a .txt file
evaluation_file = "C:\\Notebooks\\rrl_source\\dataset_raw\\merge\\new\\evaluation_results.txt"
with open(evaluation_file, "w") as file:
    file.write(f"Confusion Matrix:\n{cm}\n")
    file.write(f"ROC AUC: {roc_auc:.4f}\n")
    file.write(f"EER: {eer:.4f}\n")

# Save frame-level predictions to a .csv file
predictions_df = pd.DataFrame({
    'File ID': selected_audio_files,
    'Prediction': y_pred,
    'Prediction Score': y_pred_prob
})
predictions_file = "C:\\Notebooks\\rrl_source\\dataset_raw\\merge\\new\\frame_level_predictions.csv"
predictions_df.to_csv(predictions_file, index=False)

print(f"Evaluation results and predictions saved to {evaluation_file} and {predictions_file}.")

Some weights of Wav2Vec2Model were not initialized from the model checkpoint at facebook/wav2vec2-large-xlsr-53 and are newly initialized: ['wav2vec2.encoder.pos_conv_embed.conv.parametrizations.weight.original0', 'wav2vec2.encoder.pos_conv_embed.conv.parametrizations.weight.original1']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


25380
Number of spoof files: 20518
Number of real files: 4862
Length of segment_labels: 17
Indexing at: 24
Skipping index 24 because it's out of bounds.


	This function was moved to 'librosa.feature.rhythm.tempo' in librosa version 0.10.0.
	This alias will be removed in librosa version 1.0.
  tempo = librosa.beat.tempo(y=windowed_segment, sr=sr)[0]
Processing files:   0%|                                                                         | 0/20 [00:03<?, ?it/s]


IndexError: index 17 is out of bounds for axis 0 with size 17

In [19]:
import os
import numpy as np
import librosa
from glob import glob

# Function to load segment-level or utterance-level labels
def load_labels(label_file, utterance_level):
    labels = {}
    
    if utterance_level:
        # For utterance-level labels, parse the label file and map file_ids to class labels
        with open(label_file, 'r') as file:
            for line in file:
                parts = line.strip().split(' ')
                if len(parts) >= 5:  # Assuming the line structure is consistent
                    file_id = parts[1].strip()  # Extract file_id
                    label = parts[-1].strip()   # Extract label ("spoof" or "bonafide")
                    labels[file_id] = 0 if label == 'spoof' else 1
    else:
        # For segment-level labels, load from the numpy file
        labels = np.load(label_file, allow_pickle=True).item()

    return labels

# Function to calculate the number of frames for a given audio based on window_size and hop_size
def get_num_frames(audio_length, window_size, hop_size, sr):
    segment_length = int(window_size * sr)
    hop_length = int(hop_size * sr)
    return int((audio_length - segment_length) / hop_length) + 1

# Paths for dataset and labels
train_audio_path = r"F:\\Awais_data\\Datasets\\PartialSpoof\\train\\con_wav\\*.wav"
segment_label_path = r"F:\Awais_data\Datasets\PartialSpoof\database_segment_labels\database\segment_labels\train_seglab_0.16.npy"
utterance_label_path = r"F:\Awais_data\Datasets\PartialSpoof\protocols\PartialSpoof_LA_cm_protocols\PartialSpoof.LA.cm.train.trl.txt"

# Parameters
window_size = 0.16  # Segment length in seconds
hop_size = 0.15    # Frame shift in seconds
extract_utterance_level = True  # Set to True for utterance-level, False for segment-level

# Load labels
labels = load_labels(utterance_label_path if extract_utterance_level else segment_label_path, extract_utterance_level)

# Load 3 random audio files from the dataset
audio_files = glob(train_audio_path)
random_files = np.random.choice(audio_files, 3, replace=False)

# Process each audio file
for audio_file in random_files:
    file_id = os.path.basename(audio_file).replace('.wav', '')
    
    if file_id not in labels:
        print(f"File ID {file_id} not found in labels")
        continue

    y, sr = librosa.load(audio_file, sr=16000)
    audio_length = len(y)

    # Get the number of frames for this audio file
    num_frames = get_num_frames(audio_length, window_size, hop_size, sr)

    # Get the corresponding labels for the audio file
    if extract_utterance_level:
        label = labels[file_id]
        segment_labels = [label] * num_frames
    else:
        segment_labels = labels[file_id]

    # Print the details
    print(f"Audio File: {file_id}")
    print(f"Number of frames generated: {num_frames}")
    print(f"Number of labels in the segment label file: {len(segment_labels)}")
    print(f"Labels: {segment_labels[:50]}...")  # Print first 5 labels as an example
    print('-' * 50)


Audio File: LA_T_6741908
Number of frames generated: 15
Number of labels in the segment label file: 15
Labels: [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]...
--------------------------------------------------
Audio File: CON_T_0008091
Number of frames generated: 18
Number of labels in the segment label file: 18
Labels: [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]...
--------------------------------------------------
Audio File: CON_T_0005915
Number of frames generated: 31
Number of labels in the segment label file: 31
Labels: [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]...
--------------------------------------------------


In [3]:
import os
import numpy as np
import pandas as pd
from glob import glob
import librosa
import torch
from tqdm import tqdm
from transformers import Wav2Vec2FeatureExtractor, Wav2Vec2Model
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, roc_auc_score, precision_recall_curve, auc, roc_curve
from sklearn.neural_network import MLPClassifier
from sklearn.preprocessing import StandardScaler
import torch

# Determine if a GPU is available
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"Using device: {device}")

# Initialize paths and parameters
model_name = "facebook/wav2vec2-large-xlsr-53"
feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained(model_name)
model = Wav2Vec2Model.from_pretrained(model_name).to(device)

train_audio_path = r"F:\\Awais_data\\Datasets\\PartialSpoof\\train\\con_wav\\*.wav"
segment_label_path = r"F:\Awais_data\Datasets\PartialSpoof\database_segment_labels\database\segment_labels\train_seglab_0.16.npy"
utterance_label_path = r"F:\Awais_data\Datasets\PartialSpoof\protocols\PartialSpoof_LA_cm_protocols\PartialSpoof.LA.cm.train.trl.txt"
save_path = r"C:\\Notebooks\\rrl_source\\dataset_raw\\merge\\new\\"

window_size = 0.16 # Segment length in seconds
hop_size = 0.16    # Frame shift in seconds
extract_utterance_level = True  # Set to True for utterance-level, False for segment-level

# Hann window function
def hann_window(length):
    return 0.5 * (1 - np.cos(2 * np.pi * np.arange(length) / (length - 1)))

# Load segment-level or utterance-level labels
def load_labels(label_file, utterance_level):
    labels = {}
    
    if utterance_level:
        # For utterance-level labels, we parse the label file and map the class labels to file_ids
        with open(label_file, 'r') as file:
            for line in file:
                parts = line.strip().split(' ')
                if len(parts) >= 5:  # Assuming the line structure is consistent
                    file_id = parts[1].strip()  # Extract the file_id
                    label = parts[-1].strip()   # Extract the label (e.g., "spoof" or "bonafide")
                    # Assign 0 for spoof, 1 for real/bonafide
                    labels[file_id] = 0 if label == 'spoof' else 1
    else:
        # If working with segment-level labels
        labels = np.load(label_file, allow_pickle=True).item()

    return labels

def extract_handcrafted_features(segment, sr, target_size=60):
    windowed_segment = segment * hann_window(len(segment))
    mfcc = librosa.feature.mfcc(y=windowed_segment, sr=sr, n_mfcc=13).mean(axis=1)
    delta_mfcc = librosa.feature.delta(mfcc)
    # tempo = librosa.feature.rhythm.tempo(y=windowed_segment, sr=sr)[0]
    tempo = librosa.beat.tempo(y=windowed_segment, sr=sr)[0]
    chroma = librosa.feature.chroma_stft(y=windowed_segment, sr=sr).mean(axis=1)
    zcr = librosa.feature.zero_crossing_rate(windowed_segment).mean()
    energy = librosa.feature.rms(y=windowed_segment).mean()
    pitches, _ = librosa.core.piptrack(y=windowed_segment, sr=sr)
    pitch = np.mean(pitches[pitches > 0]) if len(pitches[pitches > 0]) > 0 else 0
    tempogram = librosa.feature.tempogram(y=windowed_segment, sr=sr).mean(axis=1)
    downsampled_tempogram = tempogram[::int(np.ceil(len(tempogram) / 18))]
    
    features = np.concatenate((mfcc, delta_mfcc, [tempo], chroma, [zcr], [energy], [pitch], downsampled_tempogram))
    
    # Pad or truncate features to the target size
    if len(features) < target_size:
        features = np.pad(features, (0, target_size - len(features)), mode='constant')
    elif len(features) > target_size:
        features = features[:target_size]    
    return features

def extract_wav2vec_features(segment, sr, target_size=1024):
    if len(segment.shape) > 1:   # Ensure the segment is a 1D array and not a 2D array
        segment = segment.flatten()  # Flatten it to 1D if needed
    
    inputs = feature_extractor(segment, sampling_rate=sr, return_tensors="pt", padding=False).to(device)
    
    with torch.no_grad():
        outputs = model(**inputs)  # Extract the last hidden state
    wav2vec_features = outputs.last_hidden_state.mean(dim=1).cpu().squeeze().numpy()
    
    # Pad or truncate to fixed size (1024)
    if len(wav2vec_features) < target_size:
        wav2vec_features = np.pad(wav2vec_features, (0, target_size - len(wav2vec_features)), mode='constant')
    elif len(wav2vec_features) > target_size:
        wav2vec_features = wav2vec_features[:target_size]
    
    return wav2vec_features

# Main processing function
def process_dataset(audio_files, labels, utterance_level):
    all_features = []
    all_labels = []
    for audio_file in tqdm(audio_files, desc="Processing files"):
        file_id = os.path.basename(audio_file).replace('.wav', '')
        if file_id not in labels:
            print('no file id exist')
            continue

        y, sr = librosa.load(audio_file, sr=16000)
        y = librosa.util.fix_length(y, size=int(4 * sr))  # Truncate or pad to 4 seconds
  # Truncate or pad to 4 seconds
        num_frames = int(4 / hop_size)
        
        if utterance_level:
            label = labels[file_id]
            for _ in range(num_frames):
                handcrafted_features = extract_handcrafted_features(y, sr)
                wav2vec_features = extract_wav2vec_features(y, sr)
                combined_features = np.concatenate((handcrafted_features, wav2vec_features))
                all_features.append(combined_features)
                all_labels.append(label)
        else:
            segment_labels = labels[file_id]
            segment_length = int(window_size * sr)
            hop_length = int(hop_size * sr)
            for i in range(num_frames):
                start = i * hop_length
                end = start + segment_length
                segment = y[start:end]
                handcrafted_features = extract_handcrafted_features(segment, sr)
                wav2vec_features = extract_wav2vec_features(segment, sr)
                combined_features = np.concatenate((handcrafted_features, wav2vec_features))
                all_features.append(combined_features)
                all_labels.append(segment_labels[i])

    return np.array(all_features), np.array(all_labels)

# Load labels and process dataset for 100 spoof and 100 real audios
labels = load_labels(utterance_label_path if extract_utterance_level else segment_label_path, extract_utterance_level)

audio_files = glob(train_audio_path)
print(len(audio_files))

# Extract file IDs (for matching with labels)
audio_file_ids = [os.path.basename(file).replace('.wav', '') for file in audio_files]
# print(audio_file_ids[:10])  # Print first 10 file IDs

# Now, match the files based on file_id and their labels
spoof_files = [file for file in audio_files if os.path.basename(file).replace('.wav', '') in labels and labels[os.path.basename(file).replace('.wav', '')] == 0]
print(len(spoof_files))

real_files = [file for file in audio_files if os.path.basename(file).replace('.wav', '') in labels and labels[os.path.basename(file).replace('.wav', '')] == 1]
print(len(real_files))

# Select 100 spoof audios from the start and 100 real audios from the end
# selected_spoof_files = spoof_files[:50]  # Selecting the first 100 spoof files
# selected_real_files = real_files[-25:]  # Selecting the last 100 real files

selected_spoof_files = spoof_files  # Use all available spoof files
selected_real_files = real_files  # Use all available real files

# Print names and labels of selected audios
for audio_file in selected_spoof_files:
    file_id = os.path.basename(audio_file).replace('.wav', '')
    # print(f"Selected spoof audio: {file_id} - Label: {labels[file_id]}")

for audio_file in selected_real_files:
    file_id = os.path.basename(audio_file).replace('.wav', '')
    # print(f"Selected real audio: {file_id} - Label: {labels[file_id]}")

# Combine the selected files
selected_audio_files = selected_spoof_files + selected_real_files

# Process the dataset
features, labels = process_dataset(selected_audio_files, labels, extract_utterance_level)

# Split dataset into train, validation, and test sets (80-10-10 split)
X_train, X_temp, y_train, y_temp = train_test_split(features, labels, test_size=0.2, random_state=42)
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.2, random_state=42)

# Standardize the features (important for MLP performance)
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_val = scaler.transform(X_val)
X_test = scaler.transform(X_test)

# Train an MLP classifier
mlp = MLPClassifier(hidden_layer_sizes=(512, 256), max_iter=300, random_state=42)
mlp.fit(X_train, y_train)

# Evaluate the model
y_pred = mlp.predict(X_test)
y_pred_prob = mlp.predict_proba(X_test)[:, 1]  # Probabilities for ROC AUC and EER

# Confusion matrix
cm = confusion_matrix(y_test, y_pred)

# ROC AUC
roc_auc = roc_auc_score(y_test, y_pred_prob)

# EER calculation
precision, recall, thresholds = precision_recall_curve(y_test, y_pred_prob)
fpr, tpr, _ = roc_curve(y_test, y_pred_prob)
eer = fpr[np.nanargmin(np.abs(fpr - (1 - tpr)))]  # EER is where fpr = 1 - tpr

print(f"Confusion Matrix:\n{cm}")
print(f"ROC AUC: {roc_auc:.4f}")
print(f"EER: {eer:.4f}")

# Save evaluation results to a .txt file
evaluation_file = "C:\\Notebooks\\rrl_source\\dataset_raw\\merge\\new\\evaluation_results.txt"
with open(evaluation_file, "w") as file:
    file.write(f"Confusion Matrix:\n{cm}\n")
    file.write(f"ROC AUC: {roc_auc:.4f}\n")
    file.write(f"EER: {eer:.4f}\n")

# Save frame-level predictions to a .csv file
predictions_df = pd.DataFrame({
    'File ID': selected_audio_files,
    'Prediction': y_pred,
    'Prediction Score': y_pred_prob
})
predictions_file = "C:\\Notebooks\\rrl_source\\dataset_raw\\merge\\new\\frame_level_predictions.csv"
predictions_df.to_csv(predictions_file, index=False)

print(f"Evaluation results and predictions saved to {evaluation_file} and {predictions_file}.")



Using device: cuda


Some weights of Wav2Vec2Model were not initialized from the model checkpoint at facebook/wav2vec2-large-xlsr-53 and are newly initialized: ['wav2vec2.encoder.pos_conv_embed.conv.parametrizations.weight.original0', 'wav2vec2.encoder.pos_conv_embed.conv.parametrizations.weight.original1']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


25380
22800
2580


	This function was moved to 'librosa.feature.rhythm.tempo' in librosa version 0.10.0.
	This alias will be removed in librosa version 1.0.
  tempo = librosa.beat.tempo(y=windowed_segment, sr=sr)[0]
Processing files: 100%|███████████████████████████████████████████████████████| 25380/25380 [39:58:28<00:00,  5.67s/it]


Confusion Matrix:
[[22727     0]
 [    0  2653]]
ROC AUC: 1.0000
EER: 0.0000
Evaluation results and predictions saved to C:\Notebooks\rrl_source\dataset_raw\merge\new\evaluation_results.txt and C:\Notebooks\rrl_source\dataset_raw\merge\new\frame_level_predictions.csv.


In [1]:
import os
import numpy as np
import pandas as pd
from glob import glob
import librosa
import torch
from tqdm import tqdm
from transformers import Wav2Vec2FeatureExtractor, Wav2Vec2Model
from sklearn.metrics import confusion_matrix, roc_auc_score, precision_recall_curve, auc, roc_curve
from sklearn.neural_network import MLPClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.utils import shuffle
from scipy.optimize import brentq
from scipy.interpolate import interp1d

# Determine if a GPU is available
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"Using device: {device}")

# Initialize paths and parameters
model_name = "facebook/wav2vec2-large-xlsr-53"
feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained(model_name)
model = Wav2Vec2Model.from_pretrained(model_name).to(device)

train_audio_path = r"F:\\Awais_data\\Datasets\\PartialSpoof\\train\\con_wav\\*.wav"
train_utterance_label_path = r"F:\\Awais_data\\Datasets\\PartialSpoof\\protocols\\PartialSpoof_LA_cm_protocols\\PartialSpoof.LA.cm.train.trl.txt"
train_segment_label_path = r"F:\\Awais_data\\Datasets\\PartialSpoof\\database_segment_labels\\database\\segment_labels\\train_seglab_0.16.npy"

val_audio_path = r"F:\\Awais_data\\Datasets\\PartialSpoof\\dev\\con_wav\\*.wav"
val_utterance_label_path = r"F:\\Awais_data\\Datasets\\PartialSpoof\\protocols\\PartialSpoof_LA_cm_protocols\\PartialSpoof.LA.cm.dev.trl.txt"
val_segment_label_path = r"F:\\Awais_data\\Datasets\\PartialSpoof\\database_segment_labels\\database\\segment_labels\\dev_seglab_0.16.npy"

test_audio_path = r"F:\\Awais_data\\Datasets\\PartialSpoof\\eval\\con_wav\\*.wav"
test_utterance_label_path = r"F:\\Awais_data\\Datasets\\PartialSpoof\\protocols\\PartialSpoof_LA_cm_protocols\\PartialSpoof.LA.cm.eval.trl.txt"
test_segment_label_path = r"F:\\Awais_data\\Datasets\\PartialSpoof\\database_segment_labels\\database\\segment_labels\\eval_seglab_0.16.npy"

window_size = 0.16  # Segment length in seconds
hop_size = 0.16     # Frame shift in seconds
   # Target feature size

# Function to load labels
def load_labels(label_file, utterance_level):
    labels = {}
    if utterance_level:
        with open(label_file, 'r') as file:
            for line in file:
                parts = line.strip().split(' ')
                if len(parts) >= 5:  # Assuming the line structure is consistent
                    file_id = parts[1].strip()  # Extract the file_id
                    label = parts[-1].strip()   # Extract the label (e.g., "spoof" or "bonafide")
                    labels[file_id] = 0 if label == 'spoof' else 1
    else:
        labels = np.load(label_file, allow_pickle=True).item()
    return labels

# Function to balance the dataset
def balance_dataset(audio_files, labels, max_per_class):
    spoof_files = [file for file in audio_files if os.path.basename(file).replace('.wav', '') in labels and labels[os.path.basename(file).replace('.wav', '')] == 0]
    real_files = [file for file in audio_files if os.path.basename(file).replace('.wav', '') in labels and labels[os.path.basename(file).replace('.wav', '')] == 1]
    balanced_spoof_files = spoof_files[:max_per_class]
    balanced_real_files = real_files[:max_per_class]
    return balanced_spoof_files + balanced_real_files

# Main processing function
def process_dataset(audio_files, labels, utterance_level):
    all_features = []
    all_labels = []
    for audio_file in tqdm(audio_files, desc="Processing files"):
        file_id = os.path.basename(audio_file).replace('.wav', '')
        if file_id not in labels:
            continue
        y, sr = librosa.load(audio_file, sr=16000)
        y = librosa.util.fix_length(y, size=int(4 * sr))  # Truncate or pad to 4 seconds
        if utterance_level:
            label = labels[file_id]
            handcrafted_features = extract_handcrafted_features(y, sr)
            wav2vec_features = extract_wav2vec_features(y, sr)
            combined_features = np.concatenate((handcrafted_features, wav2vec_features))
            all_features.append(combined_features)
            all_labels.append(label)
    return np.array(all_features), np.array(all_labels)

# Handcrafted and Wav2Vec feature extraction functions (as defined previously)
def extract_handcrafted_features(segment, sr, target_size=60):
    windowed_segment = segment * hann_window(len(segment))
    mfcc = librosa.feature.mfcc(y=windowed_segment, sr=sr, n_mfcc=13).mean(axis=1)
    delta_mfcc = librosa.feature.delta(mfcc)
    # tempo = librosa.feature.rhythm.tempo(y=windowed_segment, sr=sr)[0]
    tempo = librosa.beat.tempo(y=windowed_segment, sr=sr)[0]
    chroma = librosa.feature.chroma_stft(y=windowed_segment, sr=sr).mean(axis=1)
    zcr = librosa.feature.zero_crossing_rate(windowed_segment).mean()
    energy = librosa.feature.rms(y=windowed_segment).mean()
    pitches, _ = librosa.core.piptrack(y=windowed_segment, sr=sr)
    pitch = np.mean(pitches[pitches > 0]) if len(pitches[pitches > 0]) > 0 else 0
    tempogram = librosa.feature.tempogram(y=windowed_segment, sr=sr).mean(axis=1)
    downsampled_tempogram = tempogram[::int(np.ceil(len(tempogram) / 18))]
    
    features = np.concatenate((mfcc, delta_mfcc, [tempo], chroma, [zcr], [energy], [pitch], downsampled_tempogram))
    
    # Pad or truncate features to the target size
    if len(features) < target_size:
        features = np.pad(features, (0, target_size - len(features)), mode='constant')
    elif len(features) > target_size:
        features = features[:target_size]    
    return features

def extract_wav2vec_features(segment, sr, target_size=1024):
    if len(segment.shape) > 1:   # Ensure the segment is a 1D array and not a 2D array
        segment = segment.flatten()  # Flatten it to 1D if needed
    
    inputs = feature_extractor(segment, sampling_rate=sr, return_tensors="pt", padding=False).to(device)
    
    with torch.no_grad():
        outputs = model(**inputs)  # Extract the last hidden state
    wav2vec_features = outputs.last_hidden_state.mean(dim=1).cpu().squeeze().numpy()
    
    # Pad or truncate to fixed size (1024)
    if len(wav2vec_features) < target_size:
        wav2vec_features = np.pad(wav2vec_features, (0, target_size - len(wav2vec_features)), mode='constant')
    elif len(wav2vec_features) > target_size:
        wav2vec_features = wav2vec_features[:target_size]
    
    return wav2vec_features

def calculate_eer(y_true, y_scores):
    # Calculate false acceptance rate (FAR) and false rejection rate (FRR)
    fpr, tpr, thresholds = roc_curve(y_true, y_scores, pos_label=1)
    fnr = 1 - tpr
    eer_threshold = thresholds[np.nanargmin(np.absolute((fnr - fpr)))]
    eer = fpr[np.nanargmin(np.absolute((fnr - fpr)))]
    return eer, eer_threshold

# Load complete unbalanced datasets
train_files = glob(train_audio_path)
val_files = glob(val_audio_path)
test_files = glob(test_audio_path)

# Load and balance datasets
# train_files = balance_dataset(glob(train_audio_path), train_labels, max_per_class=1200)
# val_files = balance_dataset(glob(val_audio_path), val_labels, max_per_class=2500)
# test_files = balance_dataset(glob(test_audio_path), test_labels, max_per_class=2500)

# Load labels
train_labels = load_labels(train_utterance_label_path, utterance_level=True)
val_labels = load_labels(val_utterance_label_path, utterance_level=True)
test_labels = load_labels(test_utterance_label_path, utterance_level=True)

# Process datasets
X_train, y_train = process_dataset(train_files, train_labels, utterance_level=False)
X_val, y_val = process_dataset(val_files, val_labels, utterance_level=False)
X_test, y_test = process_dataset(test_files, test_labels, utterance_level=False)

# Standardize features
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_val = scaler.transform(X_val)
X_test = scaler.transform(X_test)

# Train the model
mlp = MLPClassifier(hidden_layer_sizes=(512, 256), max_iter=300, random_state=42)
mlp.fit(X_train, y_train)

# Validate the model
y_val_pred = mlp.predict(X_val)
y_val_pred_prob = mlp.predict_proba(X_val)[:, 1]

# Test the model
y_test_pred = mlp.predict(X_test)
y_test_pred_prob = mlp.predict_proba(X_test)[:, 1]

# Metrics
val_cm = confusion_matrix(y_val, y_val_pred)
test_cm = confusion_matrix(y_test, y_test_pred)
val_roc_auc = roc_auc_score(y_val, y_val_pred_prob)
test_roc_auc = roc_auc_score(y_test, y_test_pred_prob)

# EER Computation
val_eer, val_threshold = calculate_eer(y_val, y_val_pred_prob)
test_eer, test_threshold = calculate_eer(y_test, y_test_pred_prob)



print(f"Validation Confusion Matrix:\n{val_cm}")
print(f"Validation ROC AUC: {val_roc_auc:.4f}")
print(f"Validation EER: {val_eer:.4f}, Threshold: {val_threshold:.4f}\n")
print(f"Test Confusion Matrix:\n{test_cm}")
print(f"Test ROC AUC: {test_roc_auc:.4f}")
print(f"Test EER: {test_eer:.4f}, Threshold: {test_threshold:.4f}")


Using device: cuda


Some weights of Wav2Vec2Model were not initialized from the model checkpoint at facebook/wav2vec2-large-xlsr-53 and are newly initialized: ['wav2vec2.encoder.pos_conv_embed.conv.parametrizations.weight.original0', 'wav2vec2.encoder.pos_conv_embed.conv.parametrizations.weight.original1']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


FileNotFoundError: [Errno 2] No such file or directory: 'F:\\\\Awais_data\\\\Datasets\\\\PartialSpoof\\\\protocols\\\\PartialSpoof_LA_cm_protocols\\\\PartialSpoof.LA.cm.train.trl.txt'