In [None]:
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
import pandas as pd
from sklearn.metrics import accuracy_score, roc_auc_score, confusion_matrix

# Read features and labels from CSV file
# def load_data_from_csv(file_path): # Load CSV file
#     df = pd.read_csv(file_path)# Extract features (excluding 'FileID', 'Polarity', and 'Label')
#     features = df.iloc[:, 1:-2].values  # Columns 2 to second-last (1024-dimensional features) # Extract labels (second-last column)
#     labels = df['Label'].values
#     return features, labels

def load_data_from_csv(file_path, samples_per_class=4000):# Load CSV file
    df = pd.read_csv(file_path) # Group by 'Label' and select only the first 'samples_per_class' rows from each group
    df_filtered = df.groupby('Label').apply(lambda x: x.head(samples_per_class)).reset_index(drop=True)# Extract features (excluding 'FileID', 'Polarity', and 'Label')
    features = df_filtered.iloc[:, 1:-2].values  # Columns 2 to second-last (1024-dimensional features)# Extract labels (second-last column)
    labels = df_filtered['Label'].values
    
    return features, labels

# Define a simple frame-level classifier (e.g., a small MLP)
class FrameLevelClassifier(nn.Module):
    def __init__(self, input_dim):
        super(FrameLevelClassifier, self).__init__()
        self.fc1 = nn.Linear(input_dim, 128)
        self.relu = nn.ReLU()
        self.fc2 = nn.Linear(128, 64)
        self.relu2 = nn.ReLU()
        self.output_layer = nn.Linear(64, 1)  # Binary classification: output 1 score per frame

    def forward(self, x):
        x = self.fc1(x)
        x = self.relu(x)
        x = self.fc2(x)
        x = self.relu2(x)
        x = self.output_layer(x)
        return torch.sigmoid(x)  # Ensure the output shape is (batch_size, 1)

# Function to classify and evaluate
def classify_and_evaluate(features, labels, utterance_level, model):
    model.eval()
    all_frame_scores = []
    all_final_labels = []

    for feature_set, label in zip(features, labels):
        # Convert features to tensor
        inputs = torch.tensor(feature_set, dtype=torch.float32).unsqueeze(0)  # Add batch dimension
        
        with torch.no_grad():
            frame_scores = model(inputs).numpy()  # Frame-level scores

        all_frame_scores.append(frame_scores)

        # Apply max pooling logic for utterance-level classification
        if utterance_level:
            pooled_label = 1 if np.sum(frame_scores > 0.5) > 1 else 0
            all_final_labels.append(pooled_label)
        else:
            frame_labels = (frame_scores > 0.5).astype(int)
            all_final_labels.append(frame_labels)

    return all_frame_scores, all_final_labels

# Load data from CSV
file_path = r'C:\Notebooks\rrl_source\dataset_raw\train_segment_Wav2Vec2_part1.csv'  # Update this path
features, labels = load_data_from_csv(file_path)

# Initialize and train a classifier
input_dim = 1024  # Feature dimension
model = FrameLevelClassifier(input_dim)
criterion = nn.BCELoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

# Convert labels to tensors
labels_tensor = torch.tensor(labels, dtype=torch.float32).unsqueeze(1)  # Add batch dimension

# Training loop
for epoch in range(10):
    model.train()
    for i in range(len(features)):
        feature_set = features[i].reshape(1, -1)  # Reshape for single input
        inputs = torch.tensor(feature_set, dtype=torch.float32)
        targets = labels_tensor[i].unsqueeze(0)  # Add batch dimension

        optimizer.zero_grad()
        outputs = model(inputs)
        loss = criterion(outputs, targets)  # Loss should work now since shapes match
        loss.backward()
        optimizer.step()

# Classify and evaluate
utterance_level = False  # Toggle for utterance or segment-level evaluation
frame_scores, final_labels = classify_and_evaluate(features, labels, utterance_level, model)

# Display results
for i, (scores, final_label) in enumerate(zip(frame_scores, final_labels)):
    print(f"Audio {i+1}:")
    print(f"Predicted Frame-Level Labels: {final_label}")
    print(f"Frame Scores: {scores}\n")

# Compute metrics
y_true = labels  # Assuming ground truth from the CSV
if utterance_level:
    y_pred = final_labels
    if len(np.unique(y_true)) > 1 and len(np.unique(y_pred)) > 1:
        auc_score = roc_auc_score(y_true, y_pred)
        print(f"AUC Score (Utterance-Level): {auc_score}")
    else:
        print("AUC Score is not defined due to lack of class variance.")
    cm = confusion_matrix(y_true, y_pred)
    print(f"Confusion Matrix:\n{cm}")
else:
    accuracy = accuracy_score(y_true, np.round(final_labels))
    print(f"Accuracy (Frame-Level): {accuracy}")
    cm = confusion_matrix(y_true, np.round(final_labels))
    print(f"Confusion Matrix (Frame-Level):\n{cm}")


# Load data from CSV
# file_path = r'C:\Notebooks\rrl_source\dataset_raw\train_segment_Wav2Vec2_part1.csv'  # Update this path



In [1]:
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.metrics import accuracy_score, roc_auc_score, confusion_matrix

# Define a simple frame-level classifier (e.g., a small MLP)
class FrameLevelClassifier(nn.Module):
    def __init__(self, input_dim):
        super(FrameLevelClassifier, self).__init__()
        self.fc1 = nn.Linear(input_dim, 128)
        self.relu = nn.ReLU()
        self.fc2 = nn.Linear(128, 64)
        self.relu2 = nn.ReLU()
        self.output_layer = nn.Linear(64, 1)  # Binary classification: output 1 score per frame

    def forward(self, x):
        x = self.fc1(x)
        x = self.relu(x)
        x = self.fc2(x)
        x = self.relu2(x)
        x = self.output_layer(x)
        return torch.sigmoid(x).squeeze()  # Sigmoid activation for binary classification

# Example function to classify and perform max pooling
def classify_and_evaluate(features, labels, utterance_level, model):
    model.eval()
    all_frame_scores = []
    all_final_labels = []
    all_frame_labels = []  # To store individual frame labels for each audio

    for feature_set, label_set in zip(features, labels):
        # Convert features to tensor
        inputs = torch.tensor(feature_set, dtype=torch.float32)
        with torch.no_grad():
            frame_scores = model(inputs).numpy()  # Frame-level scores

        all_frame_scores.append(frame_scores)

        # Apply max pooling logic for utterance-level classification
        if utterance_level:
            pooled_label = 1 if np.sum(frame_scores > 0.5) > 1 else 0
            all_final_labels.append(pooled_label)
        else:
            # Frame-level labels (using 0.5 as threshold for binary classification)
            frame_labels = (frame_scores > 0.5).astype(int)
            all_final_labels.extend(frame_labels)
            all_frame_labels.append(frame_labels)  # Save frame-level labels

    return all_frame_scores, all_final_labels, all_frame_labels

# Dummy Data Example
features = [np.random.rand(50, 1024) for _ in range(10)]  # 10 audio segments with 50 frames, 20 features each
labels = [np.random.randint(0, 2, size=50) for _ in range(10)]  # Random binary labels for frames

# Initialize and train a classifier (example training loop)
input_dim = 1024  # Example feature dimension
model = FrameLevelClassifier(input_dim)
criterion = nn.BCELoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

# Example training loop (adjust with real data)
for epoch in range(10):
    model.train()
    for feature_set, label_set in zip(features, labels):
        inputs = torch.tensor(feature_set, dtype=torch.float32)
        targets = torch.tensor(label_set, dtype=torch.float32)

        optimizer.zero_grad()
        outputs = model(inputs)
        loss = criterion(outputs, targets)
        loss.backward()
        optimizer.step()

# Classify and evaluate
utterance_level = False  # Toggle for utterance or segment-level evaluation
frame_scores, final_labels, frame_labels = classify_and_evaluate(features, labels, utterance_level, model)

# Define the path to save the results
results_path = './results.txt'

# Store results in a file
with open(results_path, 'w') as file:
    for i, (scores, final_label, frame_label) in enumerate(zip(frame_scores, final_labels, frame_labels)):
        file.write(f"Audio {i+1}:\n")
        file.write(f"Predicted Frame-Level Labels: {frame_label}\n")
        file.write(f"Aggregated Utterance-Level Label: {final_label}\n")
        file.write(f"Frame Scores: {scores}\n\n")

# Display results in the console
for i, (scores, final_label, frame_label) in enumerate(zip(frame_scores, final_labels, frame_labels)):
    print(f"Audio {i+1}:")
    print(f"Predicted Frame-Level Labels: {frame_label}")
    print(f"Aggregated Utterance-Level Label: {final_label}")
    print(f"Frame Scores: {scores}\n")

# Compute metrics (example for pooled results)
if utterance_level:
    y_true = [np.sum(l) > 0 for l in labels]  # Assuming ground truth for utterance-level
    # Check for class imbalance in y_true and final_labels before computing AUC
    if len(np.unique(y_true)) > 1 and len(np.unique(final_labels)) > 1:
        auc_score = roc_auc_score(y_true, final_labels)
        print(f"AUC Score (Utterance-Level): {auc_score}")
    else:
        print("AUC Score is not defined as only one class is present in y_true or final_labels.")
        auc_score = None  # Or handle as needed

    # Confusion Matrix Example
    cm = confusion_matrix(y_true, final_labels)
    print(f"Confusion Matrix:\n{cm}")
else:
    # If using frame-level evaluation, print accuracy and confusion matrix
    y_true_frame = [frame_label for label_set in labels for frame_label in label_set]
    frame_labels = [frame_label for frame_scores in final_labels for frame_label in (np.array(frame_scores) > 0.5).astype(int)]
    
    accuracy = accuracy_score(y_true_frame, frame_labels)
    print(f"Accuracy (Frame-Level): {accuracy}")
    
    cm = confusion_matrix(y_true_frame, frame_labels)
    print(f"Confusion Matrix (Frame-Level):\n{cm}")


Audio 1:
Predicted Frame-Level Labels: [1 1 0 1 1 1 1 1 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1]
Aggregated Utterance-Level Label: 1
Frame Scores: [0.63363916 0.7450234  0.48715135 0.75470126 0.72097987 0.5482122
 0.5309831  0.6241604  0.40524206 0.56832165 0.51470214 0.5186086
 0.7048439  0.7434709  0.6550613  0.64998937 0.7085075  0.5891417
 0.5748761  0.6057382  0.61231196 0.6231325  0.7605057  0.5068481
 0.42042223 0.7376006  0.53063726 0.5860724  0.7316179  0.6111641
 0.6860765  0.7492187  0.760431   0.54094803 0.6465549  0.67445576
 0.7018202  0.6244138  0.7254839  0.74242395 0.6885924  0.6384443
 0.57532847 0.5393755  0.6699966  0.6812506  0.7863876  0.686596
 0.5131256  0.54514575]

Audio 2:
Predicted Frame-Level Labels: [1 1 0 1 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1]
Aggregated Utterance-Level Label: 1
Frame Scores: [0.6467512  0.76176375 0.4959641  0.6883033  0.45702988 0.51432

TypeError: 'numpy.int32' object is not iterable

In [23]:
import torch
import torch.nn as nn
import torch.optim as optim
import librosa
import numpy as np
from transformers import Wav2Vec2FeatureExtractor, Wav2Vec2Model
from sklearn.metrics import accuracy_score, confusion_matrix, roc_auc_score
import glob

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"Using device: {device}")
# Initialize paths and parameters
model_name = "facebook/wav2vec2-large-xlsr-53"
feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained(model_name)
model = Wav2Vec2Model.from_pretrained(model_name).to(device)

train_audio_path = r"F:\\Awais_data\\Datasets\\PartialSpoof\\train\\con_wav\\*.wav"
segment_label_path = r"F:\Awais_data\Datasets\PartialSpoof\database_segment_labels\database\segment_labels\train_seglab_0.16.npy"
utterance_label_path = r"F:\Awais_data\Datasets\PartialSpoof\protocols\PartialSpoof_LA_cm_protocols\PartialSpoof.LA.cm.train.trl.txt"
save_path = r"C:\\Notebooks\\rrl_source\\dataset_raw\\merge\\new2\\"

window_size = 0.16  # Segment length in seconds
hop_size = 0.16     # Frame shift in seconds
extract_utterance_level = False  # Set to True for utterance-level, False for segment-level


# Hann window function
def hann_window(length):
    return 0.5 * (1 - np.cos(2 * np.pi * np.arange(length) / (length - 1)))

# Load segment-level or utterance-level labels
def load_labels(label_file, utterance_level):
    if utterance_level:
        labels = {}
        with open(label_file, 'r') as file:
            for line in file:
                parts = line.strip().split(' ')
                if len(parts) >= 5:
                    file_id = parts[1].strip()
                    label = parts[-1].strip()
                    print(file_id)
                    print(label)
                    labels[file_id] = 0 if label == 'spoof' else 1
    else:
        labels = np.load(label_file, allow_pickle=True).item()
        print(len(labels))
    return labels

# Handcrafted feature extraction
def extract_handcrafted_features(segment, sr, target_size=60):
    windowed_segment = segment * hann_window(len(segment))
    mfcc = librosa.feature.mfcc(y=windowed_segment, sr=sr, n_mfcc=13).mean(axis=1)
    delta_mfcc = librosa.feature.delta(mfcc)
    tempo = librosa.beat.tempo(y=windowed_segment, sr=sr)[0]
    chroma = librosa.feature.chroma_stft(y=windowed_segment, sr=sr).mean(axis=1)
    zcr = librosa.feature.zero_crossing_rate(windowed_segment).mean()
    energy = librosa.feature.rms(y=windowed_segment).mean()
    pitches, _ = librosa.core.piptrack(y=windowed_segment, sr=sr)
    pitch = np.mean(pitches[pitches > 0]) if len(pitches[pitches > 0]) > 0 else 0
    tempogram = librosa.feature.tempogram(y=windowed_segment, sr=sr).mean(axis=1)
    downsampled_tempogram = tempogram[::int(np.ceil(len(tempogram) / 18))]
    features = np.concatenate((mfcc, delta_mfcc, [tempo], chroma, [zcr], [energy], [pitch], downsampled_tempogram))  # Pad or truncate features to the target size
    if len(features) < target_size:
        features = np.pad(features, (0, target_size - len(features)), mode='constant')
    elif len(features) > target_size:
        features = features[:target_size]
    return features

def extract_wav2vec_features(segment, sr, target_size=1024): 
    if len(segment.shape) > 1:   # Ensure the segment is a 1D array and not a 2D array
        segment = segment.flatten()  # Flatten it to 1D if needed  
    inputs = feature_extractor(segment, sampling_rate=sr, return_tensors="pt", padding=False).to(device)
    with torch.no_grad():
        outputs = model(**inputs)  # Extract the last hidden state and return it
    wav2vec_features =  outputs.last_hidden_state.mean(dim=1).cpu().squeeze().numpy()
    # Pad or truncate to fixed size (1024)
    if len(wav2vec_features) < target_size:
        wav2vec_features = np.pad(wav2vec_features, (0, target_size - len(wav2vec_features)), mode='constant')
    elif len(wav2vec_features) > target_size:
        wav2vec_features = wav2vec_features[:target_size]  
    return wav2vec_features

# Define a simple frame-level classifier (e.g., a small MLP)
class FrameLevelClassifier(nn.Module):
    def __init__(self, input_dim):
        super(FrameLevelClassifier, self).__init__()
        self.fc1 = nn.Linear(input_dim, 128)
        self.relu = nn.ReLU()
        self.fc2 = nn.Linear(128, 64)
        self.relu2 = nn.ReLU()
        self.output_layer = nn.Linear(64, 1)  # Binary classification: output 1 score per frame

    def forward(self, x):
        x = self.fc1(x)
        x = self.relu(x)
        x = self.fc2(x)
        x = self.relu2(x)
        x = self.output_layer(x)
        return torch.sigmoid(x).squeeze()  # Sigmoid activation for binary classification

def process_audio_files(audio_paths, labels, window_size, hop_size, feature_extractor):
    all_features = []
    all_labels = []

    for audio_path in audio_paths:
        # Load the audio file
        y, sr = librosa.load(audio_path, sr=None)  # Keep the original sample rate
        audio_id = audio_path.split("\\")[-1].replace(".wav", "")  # Strip the extension
        # print(f"Processing audio file: {audio_id}")  # Debug print
                # Get the corresponding labels for the current audio file
        file_labels = labels.get(audio_id, [])
        # print(f"Labels for {audio_id}: {file_labels}")  # Debug print
                # Check if file_labels is empty and handle appropriately
        if len(file_labels) == 0:  # Correctly check for empty labels
            print(f"Warning: No labels found for {audio_id}. Skipping this file.")
            continue  # Skip this file or handle with a default label
        # Convert window and hop sizes from seconds to samples
        window_size_samples = int(window_size * sr)
        hop_size_samples = int(hop_size * sr)
              # Process each segment in the audio file
        features = []
        segment_labels = []
        start = 0
        while start + window_size_samples <= len(y):
            end = start + window_size_samples
            segment = y[start:end]
            # Extract handcrafted and Wav2Vec2 features for this segment
            handcrafted_features = extract_handcrafted_features(segment, sr)
            wav2vec_features = extract_wav2vec_features(segment, sr)
                    # Combine features
            combined_features = np.concatenate([handcrafted_features, wav2vec_features])
            # Ensure segment length matches the expected number of features
            features.append(combined_features)

            # Assign the label from the segment labels (based on the current start index)
            label_index = int(start / hop_size_samples)

            # Ensure that the label index is within the bounds of file_labels
            if label_index < len(file_labels):
                label = file_labels[label_index]
            else:
                # Handle cases where there may not be enough labels for the segments
                label = file_labels[-1]  # Or use a default label if necessary
            segment_labels.append(label)

            # Move to the next segment
            start += hop_size_samples

        # Append the features and labels for this audio file
        all_features.append(features)
        all_labels.append(segment_labels)

    return all_features, all_labels

# Example function to classify and perform max pooling
def classify_and_evaluate(features, labels, utterance_level, model):
    model.eval()
    all_frame_scores = []
    all_final_labels = []
    all_frame_labels = []  # To store individual frame labels for each audio

    for feature_set, label_set in zip(features, labels):
        # Convert features to tensor
        inputs = torch.tensor(feature_set, dtype=torch.float32)
        with torch.no_grad():
            frame_scores = model(inputs).numpy()  # Frame-level scores

        all_frame_scores.append(frame_scores)

        # Apply max pooling logic for utterance-level classification
        if utterance_level:
            pooled_label = 1 if np.sum(frame_scores > 0.5) > 1 else 0
            all_final_labels.append(pooled_label)
        else:
            # Frame-level labels (using 0.5 as threshold for binary classification)
            frame_labels = (frame_scores > 0.5).astype(int)
            all_final_labels.extend(frame_labels)
            all_frame_labels.append(frame_labels)  # Save frame-level labels

    return all_frame_scores, all_final_labels, all_frame_labels

# Load labels
labels = load_labels(segment_label_path, extract_utterance_level)

# Process audio files and extract features
audio_paths = [f for f in glob.glob(train_audio_path)]
features, segment_labels = process_audio_files(audio_paths, labels, window_size, hop_size, feature_extractor)

# Initialize and train a classifier (example training loop)
input_dim = len(features[0])  # Length of the feature vector
model = FrameLevelClassifier(input_dim)
criterion = nn.BCELoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

# Example training loop (adjust with real data)
for epoch in range(10):
    model.train()
    for feature_set, label_set in zip(features, segment_labels):
        inputs = torch.tensor(feature_set, dtype=torch.float32)
        targets = torch.tensor([label_set], dtype=torch.float32)

        optimizer.zero_grad()
        outputs = model(inputs)
        loss = criterion(outputs, targets)
        loss.backward()
        optimizer.step()

# Classify and evaluate
utterance_level = False  # Toggle for utterance or segment-level evaluation
frame_scores, final_labels, frame_labels = classify_and_evaluate(features, segment_labels, utterance_level, model)

# Define the path to save the results
results_path = './results.txt'

# Store results in a file
with open(results_path, 'w') as file:
    for i, (scores, final_label, frame_label) in enumerate(zip(frame_scores, final_labels, frame_labels)):
        file.write(f"Audio {i+1} - Final Label: {final_label}, Frame-level Labels: {frame_label}, Frame Scores: {scores}\n")


Using device: cuda


Some weights of Wav2Vec2Model were not initialized from the model checkpoint at facebook/wav2vec2-large-xlsr-53 and are newly initialized: ['wav2vec2.encoder.pos_conv_embed.conv.parametrizations.weight.original0', 'wav2vec2.encoder.pos_conv_embed.conv.parametrizations.weight.original1']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


25380
Processing audio file: CON_T_0000000
Labels for CON_T_0000000: ['1' '1' '1' '0' '0' '0' '0' '0' '0' '0' '0' '0' '0' '0' '0' '0' '1' '1']


	This function was moved to 'librosa.feature.rhythm.tempo' in librosa version 0.10.0.
	This alias will be removed in librosa version 1.0.
  tempo = librosa.beat.tempo(y=windowed_segment, sr=sr)[0]


Processing audio file: CON_T_0000001
Labels for CON_T_0000001: ['1' '1' '1' '1' '1' '1' '1' '0' '0' '0' '0' '0' '0' '0' '0' '0' '0' '0'
 '0' '0' '0' '1' '1' '1' '1' '1' '1']
Processing audio file: CON_T_0000002
Labels for CON_T_0000002: ['1' '1' '0' '0' '0' '0' '0' '0' '0' '1' '1' '1' '1' '1' '1' '1' '1' '1'
 '1' '1']
Processing audio file: CON_T_0000003
Labels for CON_T_0000003: ['1' '1' '1' '1' '1' '1' '1' '1' '1' '1' '0' '0' '0' '0' '0' '1' '1' '1'
 '1' '0' '0' '1' '1' '1' '1' '1' '1' '1' '1' '1' '1' '1' '1' '1' '1' '1'
 '1' '1']
Processing audio file: CON_T_0000004
Labels for CON_T_0000004: ['1' '1' '1' '1' '1' '1' '1' '1' '1' '1' '0' '0' '0' '0' '1' '1' '1' '1'
 '1' '1' '1' '1']
Processing audio file: CON_T_0000005
Labels for CON_T_0000005: ['1' '1' '0' '0' '0' '0' '0' '0' '0' '0' '0' '0' '0' '1' '1' '1' '1']
Processing audio file: CON_T_0000006
Labels for CON_T_0000006: ['0' '0' '0' '0' '0' '0' '0' '0' '0' '1' '1' '1' '1' '1' '1' '1' '1']
Processing audio file: CON_T_0000007
Labe

KeyboardInterrupt: 

In [40]:
import torch
import torch.nn as nn
import torch.optim as optim
import librosa
import numpy as np
from transformers import Wav2Vec2FeatureExtractor, Wav2Vec2Model
import glob
import random
from sklearn.metrics import confusion_matrix, roc_curve
import os

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"Using device: {device}")
# Initialize paths and parameters
model_name = "facebook/wav2vec2-large-xlsr-53"
feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained(model_name)
model = Wav2Vec2Model.from_pretrained(model_name).to(device)

train_audio_path = r"F:\\Awais_data\\Datasets\\PartialSpoof\\train\\con_wav\\*.wav"
segment_label_path = r"F:\Awais_data\Datasets\PartialSpoof\database_segment_labels\database\segment_labels\train_seglab_0.16.npy"
utterance_label_path = r"F:\Awais_data\Datasets\PartialSpoof\protocols\PartialSpoof_LA_cm_protocols\PartialSpoof.LA.cm.train.trl.txt"
save_path = r"C:\\Notebooks\\rrl_source\\dataset_raw\\merge\\new\\new2\\"

window_size = 0.16  # Segment length in seconds
hop_size = 0.16     # Frame shift in seconds
extract_utterance_level = False  # Set to True for utterance-level, False for segment-level

# Load segment-level or utterance-level labels
def load_labels(label_file, utterance_level):
    if utterance_level:
        labels = {}
        with open(label_file, 'r') as file:
            for line in file:
                parts = line.strip().split(' ')
                if len(parts) >= 5:
                    file_id = parts[1].strip()
                    label = parts[-1].strip()
                    labels[file_id] = 0 if label == 'spoof' else 1
    else:
        labels = np.load(label_file, allow_pickle=True).item()
    return labels

# Handcrafted feature extraction
def extract_handcrafted_features(segment, sr, target_size=60):
    windowed_segment = segment * hann_window(len(segment))
    mfcc = librosa.feature.mfcc(y=windowed_segment, sr=sr, n_mfcc=13).mean(axis=1)
    delta_mfcc = librosa.feature.delta(mfcc)
    tempo = librosa.beat.tempo(y=windowed_segment, sr=sr)[0]
    chroma = librosa.feature.chroma_stft(y=windowed_segment, sr=sr).mean(axis=1)
    zcr = librosa.feature.zero_crossing_rate(windowed_segment).mean()
    energy = librosa.feature.rms(y=windowed_segment).mean()
    pitches, _ = librosa.core.piptrack(y=windowed_segment, sr=sr)
    pitch = np.mean(pitches[pitches > 0]) if len(pitches[pitches > 0]) > 0 else 0
    tempogram = librosa.feature.tempogram(y=windowed_segment, sr=sr).mean(axis=1)
    downsampled_tempogram = tempogram[::int(np.ceil(len(tempogram) / 18))]
    features = np.concatenate((mfcc, delta_mfcc, [tempo], chroma, [zcr], [energy], [pitch], downsampled_tempogram))  # Pad or truncate features to the target size
    if len(features) < target_size:
        features = np.pad(features, (0, target_size - len(features)), mode='constant')
    elif len(features) > target_size:
        features = features[:target_size]
    return features

def extract_wav2vec_features(segment, sr, target_size=1024): 
    if len(segment.shape) > 1:   # Ensure the segment is a 1D array and not a 2D array
        segment = segment.flatten()  # Flatten it to 1D if needed  
    inputs = feature_extractor(segment, sampling_rate=sr, return_tensors="pt", padding=False).to(device)
    with torch.no_grad():
        outputs = model(**inputs)  # Extract the last hidden state and return it
    wav2vec_features =  outputs.last_hidden_state.mean(dim=1).cpu().squeeze().numpy()
    # Pad or truncate to fixed size (1024)
    if len(wav2vec_features) < target_size:
        wav2vec_features = np.pad(wav2vec_features, (0, target_size - len(wav2vec_features)), mode='constant')
    elif len(wav2vec_features) > target_size:
        wav2vec_features = wav2vec_features[:target_size]  
    return wav2vec_features

def process_audio_files(audio_paths, labels, window_size, hop_size, feature_extractor):
    all_features = []
    all_labels = []

    class_0_audio = []
    class_1_audio = []

    # Split audio files into class 0 and class 1 based on their labels
    for audio_path in audio_paths:
        audio_id = audio_path.split("\\")[-1].replace(".wav", "")
        file_labels = labels.get(audio_id, [])
        
        # Check if the file has labels
        if len(file_labels) == 0:
            continue
        
        # We assume the label for the first frame of the audio file represents the class
        label = int(file_labels[0])  # Convert string labels like 'real'/'spoof' to 0/1
        if label == 0:
            class_0_audio.append(audio_path)
        elif label == 1:
            class_1_audio.append(audio_path)

    # Randomly select 10 files from each class
    class_0_selected = random.sample(class_0_audio, 2000)
    class_1_selected = random.sample(class_1_audio, 2200)
    selected_audio_files = class_0_selected + class_1_selected

    # Process the selected audio files
    for audio_path in selected_audio_files:
        y, sr = librosa.load(audio_path, sr=None)  # Keep the original sample rate
        audio_id = audio_path.split("\\")[-1].replace(".wav", "")  # Strip the extension
        
        file_labels = labels.get(audio_id, [])
        
        # Convert window and hop sizes from seconds to samples
        window_size_samples = int(window_size * sr)
        hop_size_samples = int(hop_size * sr)
        
        features = []
        segment_labels = []
        start = 0
        
        while start + window_size_samples <= len(y):
            end = start + window_size_samples
            segment = y[start:end]
            
            # Extract features
            handcrafted_features = extract_handcrafted_features(segment, sr)
            wav2vec_features = extract_wav2vec_features(segment, sr)
            
            combined_features = np.concatenate([handcrafted_features, wav2vec_features])
            features.append(combined_features)
            
            # Get the segment label (for simplicity, using the first label for the entire audio file)
            label_index = int(start / hop_size_samples)
            if label_index < len(file_labels):
                label = file_labels[label_index]
            else:
                label = file_labels[-1]  # Fallback in case there are not enough labels
            segment_labels.append(int(label))  # Ensure the label is an integer
            
            start += hop_size_samples

        all_features.append(features)
        all_labels.append(segment_labels)

    return all_features, all_labels

# Load labels
labels = load_labels(segment_label_path, extract_utterance_level)

# Process audio files and extract features
audio_paths = [f for f in glob.glob(train_audio_path)]
features, segment_labels = process_audio_files(audio_paths, labels, window_size, hop_size, feature_extractor)

# Initialize and train a classifier (example training loop)
# input_dim = len(features[0][0])  # Length of the feature vector for the first segment
# model = FrameLevelClassifier(input_dim)
# criterion = nn.BCELoss()
# optimizer = optim.Adam(model.parameters(), lr=0.01)

# # Example training loop (adjust with real data)
# for epoch in range(10):
#     model.train()
#     for feature_set, label_set in zip(features, segment_labels):
#         inputs = torch.tensor(feature_set, dtype=torch.float32)
        
#         # Ensure targets (labels) are numeric values (0 or 1)
#         targets = torch.tensor(label_set, dtype=torch.float32)

#         optimizer.zero_grad()
#         outputs = model(inputs)
#         loss = criterion(outputs, targets)
#         loss.backward()
#         optimizer.step()
# Initialize model
input_dim = len(features[0][0])  # Length of the feature vector for the first segment
model = FrameLevelClassifier(input_dim)
criterion = nn.BCELoss()
optimizer = optim.Adam(model.parameters(), lr=0.0001)

def calculate_eer(y_true, y_scores):
    # Calculate false acceptance rate (FAR) and false rejection rate (FRR)
    fpr, tpr, thresholds = roc_curve(y_true, y_scores, pos_label=1)
    fnr = 1 - tpr
    eer_threshold = thresholds[np.nanargmin(np.absolute((fnr - fpr)))]
    eer = fpr[np.nanargmin(np.absolute((fnr - fpr)))]
    return eer, eer_threshold

# Training loop with performance evaluation
for epoch in range(800):
    model.train()
    epoch_losses = []
    all_targets = []
    all_predictions = []
    all_scores = []

    for feature_set, label_set in zip(features, segment_labels):
        inputs = torch.tensor(feature_set, dtype=torch.float32)
        targets = torch.tensor(label_set, dtype=torch.float32)

        optimizer.zero_grad()
        outputs = model(inputs)
        loss = criterion(outputs, targets)
        loss.backward()
        optimizer.step()
        epoch_losses.append(loss.item())

        # Collect predictions and targets for evaluation
        preds = (outputs > 0.5).float()  # Convert probabilities to binary predictions
        all_targets.extend(targets.numpy().flatten())
        all_predictions.extend(preds.detach().numpy().flatten())
        all_scores.extend(outputs.detach().numpy().flatten())

    # Calculate confusion matrix and Equal Error Rate (EER)
    cm = confusion_matrix(all_targets, all_predictions)
    eer, eer_threshold = calculate_eer(all_targets, all_scores)

    # Print performance metrics
    print(f"Epoch {epoch+1}:")
    print(f"  Loss: {np.mean(epoch_losses):.4f}")
    print(f"  Confusion Matrix:\n{cm}")
    print(f"  Equal Error Rate (EER): {eer:.4f} at threshold {eer_threshold:.4f}\n")

# Classify and evaluate
utterance_level = False  # Toggle for utterance or segment-level evaluation
frame_scores, final_labels, frame_labels = classify_and_evaluate(features, segment_labels, utterance_level, model)

if not os.path.exists(save_path):
    os.makedirs(save_path)
# Define the path to save the results
results_path = os.path.join(save_path, 'results.txt')

with open(results_path, 'w') as file:
    for i, (scores, final_label, frame_label) in enumerate(zip(frame_scores, final_labels, frame_labels)):
        audio_id = audio_paths[i].split("\\")[-1].replace(".wav", "")  # Extract file ID from the audio path
        file.write(f"Audio ID: {audio_id} - Final Label: {final_label}, Frame Labels: {frame_label}, Scores: {scores}\n")
        
# Print to verify the results
for i, (scores, final_label, frame_label) in enumerate(zip(frame_scores, final_labels, frame_labels)):
    audio_id = audio_paths[i].split("\\")[-1].replace(".wav", "")  # Extract file ID from the audio path
    print(f"Audio ID: {audio_id} - Final Label: {final_label}, Frame Labels: {frame_label}, Scores: {scores}")

Using device: cuda


Some weights of Wav2Vec2Model were not initialized from the model checkpoint at facebook/wav2vec2-large-xlsr-53 and are newly initialized: ['wav2vec2.encoder.pos_conv_embed.conv.parametrizations.weight.original0', 'wav2vec2.encoder.pos_conv_embed.conv.parametrizations.weight.original1']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
	This function was moved to 'librosa.feature.rhythm.tempo' in librosa version 0.10.0.
	This alias will be removed in librosa version 1.0.
  tempo = librosa.beat.tempo(y=windowed_segment, sr=sr)[0]
  return pitch_tuning(


Epoch 1:
  Loss: 0.5880
  Confusion Matrix:
[[20221 15137]
 [11468 38730]]
  Equal Error Rate (EER): 0.3219 at threshold 0.5691

Epoch 2:
  Loss: 0.5626
  Confusion Matrix:
[[20973 14385]
 [10637 39561]]
  Equal Error Rate (EER): 0.3027 at threshold 0.5707

Epoch 3:
  Loss: 0.5469
  Confusion Matrix:
[[21503 13855]
 [10219 39979]]
  Equal Error Rate (EER): 0.2934 at threshold 0.5726

Epoch 4:
  Loss: 0.5353
  Confusion Matrix:
[[21789 13569]
 [ 9932 40266]]
  Equal Error Rate (EER): 0.2854 at threshold 0.5745

Epoch 5:
  Loss: 0.5275
  Confusion Matrix:
[[22019 13339]
 [ 9640 40558]]
  Equal Error Rate (EER): 0.2803 at threshold 0.5751

Epoch 6:
  Loss: 0.5210
  Confusion Matrix:
[[22194 13164]
 [ 9462 40736]]
  Equal Error Rate (EER): 0.2756 at threshold 0.5764

Epoch 7:
  Loss: 0.5156
  Confusion Matrix:
[[22269 13089]
 [ 9213 40985]]
  Equal Error Rate (EER): 0.2710 at threshold 0.5783

Epoch 8:
  Loss: 0.5107
  Confusion Matrix:
[[22506 12852]
 [ 9178 41020]]
  Equal Error Rate (EE

In [None]:
import os
import numpy as np
import pandas as pd
from glob import glob
import librosa
import torch
from tqdm import tqdm
from transformers import Wav2Vec2FeatureExtractor, Wav2Vec2Model
from sklearn.metrics import confusion_matrix, roc_auc_score, precision_recall_curve, auc, roc_curve
from sklearn.neural_network import MLPClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.utils import shuffle
from scipy.optimize import brentq
from scipy.interpolate import interp1d


window_size = 0.16  # Segment length in seconds
hop_size = 0.16     # Frame shift in seconds
extract_utterance_level = False 
# Determine if a GPU is available
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"Using device: {device}")

# Initialize paths and parameters
model_name = "facebook/wav2vec2-large-xlsr-53"
feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained(model_name)
model = Wav2Vec2Model.from_pretrained(model_name).to(device)

train_audio_path = r"F:\\Awais_data\\Datasets\\PartialSpoof\\train\\con_wav\\*.wav"
train_utterance_label_path = r"F:\\Awais_data\\Datasets\\PartialSpoof\\protocols\\PartialSpoof_LA_cm_protocols\\PartialSpoof.LA.cm.train.trl.txt"
train_segment_label_path = r"F:\\Awais_data\\Datasets\\PartialSpoof\\database_segment_labels\\database\\segment_labels\\train_seglab_0.16.npy"

val_audio_path = r"F:\\Awais_data\\Datasets\\PartialSpoof\\dev\\con_wav\\*.wav"
val_utterance_label_path = r"F:\\Awais_data\\Datasets\\PartialSpoof\\protocols\\PartialSpoof_LA_cm_protocols\\PartialSpoof.LA.cm.dev.trl.txt"
val_segment_label_path = r"F:\\Awais_data\\Datasets\\PartialSpoof\\database_segment_labels\\database\\segment_labels\\dev_seglab_0.16.npy"

test_audio_path = r"F:\\Awais_data\\Datasets\\PartialSpoof\\eval\\con_wav\\*.wav"
test_utterance_label_path = r"F:\\Awais_data\\Datasets\\PartialSpoof\\protocols\\PartialSpoof_LA_cm_protocols\\PartialSpoof.LA.cm.eval.trl.txt"
test_segment_label_path = r"F:\\Awais_data\\Datasets\\PartialSpoof\\database_segment_labels\\database\\segment_labels\\eval_seglab_0.16.npy"


# Load labels for training, validation, and testing
train_labels = load_labels(train_utterance_label_path if extract_utterance_level else train_segment_label_path, extract_utterance_level)
val_labels = load_labels(val_utterance_label_path if extract_utterance_level else val_segment_label_path, extract_utterance_level)
test_labels = load_labels(test_utterance_label_path if extract_utterance_level else test_segment_label_path, extract_utterance_level)

# Process datasets
train_audio_paths = glob.glob(train_audio_path)
val_audio_paths = glob.glob(val_audio_path)
test_audio_paths = glob.glob(test_audio_path)

X_train, y_train = process_audio_files(train_audio_paths, train_labels, window_size, hop_size, feature_extractor)
X_val, y_val = process_audio_files(val_audio_paths, val_labels, window_size, hop_size, feature_extractor)
X_test, y_test = process_audio_files(test_audio_paths, test_labels, window_size, hop_size, feature_extractor)

# Initialize model
input_dim = len(X_train[0][0])  # Length of the feature vector for the first segment
model = FrameLevelClassifier(input_dim).to(device)
criterion = nn.BCELoss()
optimizer = optim.Adam(model.parameters(), lr=0.0001)

# Training loop with evaluation
def train_validate_test(model, criterion, optimizer, X_train, y_train, X_val, y_val, X_test, y_test, epochs=800):
    for epoch in range(epochs):
        model.train()
        train_loss, all_targets, all_scores = [], [], []

        for features, labels in zip(X_train, y_train):
            features_tensor = torch.tensor(features, dtype=torch.float32).to(device)
            labels_tensor = torch.tensor(labels, dtype=torch.float32).to(device)

            optimizer.zero_grad()
            outputs = model(features_tensor)
            loss = criterion(outputs, labels_tensor)
            loss.backward()
            optimizer.step()

            train_loss.append(loss.item())
            all_targets.extend(labels)
            all_scores.extend(outputs.cpu().detach().numpy())

        # Calculate train EER
        train_eer, train_threshold = calculate_eer(all_targets, all_scores)

        # Validate the model
        model.eval()
        val_loss, all_val_targets, all_val_scores = [], [], []
        for features, labels in zip(X_val, y_val):
            with torch.no_grad():
                features_tensor = torch.tensor(features, dtype=torch.float32).to(device)
                labels_tensor = torch.tensor(labels, dtype=torch.float32).to(device)

                outputs = model(features_tensor)
                loss = criterion(outputs, labels_tensor)
                val_loss.append(loss.item())
                all_val_targets.extend(labels)
                all_val_scores.extend(outputs.cpu().numpy())

        # Calculate validation EER
        val_eer, val_threshold = calculate_eer(all_val_targets, all_val_scores)

        # Print Epoch Performance
        print(f"Epoch {epoch + 1}/{epochs}")
        print(f"  Train Loss: {np.mean(train_loss):.4f}, Train EER: {train_eer:.4f}")
        print(f"  Validation Loss: {np.mean(val_loss):.4f}, Validation EER: {val_eer:.4f}")

    # Test the model
    all_test_targets, all_test_scores = [], []
    for features, labels in zip(X_test, y_test):
        with torch.no_grad():
            features_tensor = torch.tensor(features, dtype=torch.float32).to(device)
            outputs = model(features_tensor)

            all_test_targets.extend(labels)
            all_test_scores.extend(outputs.cpu().numpy())

    # Calculate test EER
    test_eer, test_threshold = calculate_eer(all_test_targets, all_test_scores)
    print(f"Test EER: {test_eer:.4f}, Threshold: {test_threshold:.4f}")

train_validate_test(model, criterion, optimizer, X_train, y_train, X_val, y_val, X_test, y_test)
