In [56]:
import os
import numpy as np
import pandas as pd
from glob import glob
import librosa
import torch
from tqdm import tqdm
from transformers import Wav2Vec2FeatureExtractor, Wav2Vec2Model
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, roc_auc_score, precision_recall_curve, auc
from sklearn.neural_network import MLPClassifier
from sklearn.preprocessing import StandardScaler

# Initialize paths and parameters
model_name = "facebook/wav2vec2-large-xlsr-53"
feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained(model_name)
model = Wav2Vec2Model.from_pretrained(model_name)

train_audio_path = r"F:\\Awais_data\\Datasets\\PartialSpoof\\train\\con_wav\\*.wav"
segment_label_path = r"F:\Awais_data\Datasets\PartialSpoof\database_segment_labels\database\segment_labels\train_seglab_0.16.npy"
utterance_label_path = r"F:\Awais_data\Datasets\PartialSpoof\protocols\PartialSpoof_LA_cm_protocols\PartialSpoof.LA.cm.train.trl.txt"
save_path = r"C:\\Notebooks\\rrl_source\\dataset_raw\\merge\\new\\"

window_size = 0.16 # Segment length in seconds
hop_size = 0.16    # Frame shift in seconds
extract_utterance_level = True  # Set to True for utterance-level, False for segment-level

# Hann window function
def hann_window(length):
    return 0.5 * (1 - np.cos(2 * np.pi * np.arange(length) / (length - 1)))


# Load segment-level or utterance-level labels
def load_labels(label_file, utterance_level):
    labels = {}
    
    if utterance_level:
        # For utterance-level labels, we parse the label file and map the class labels to file_ids
        with open(label_file, 'r') as file:
            for line in file:
                parts = line.strip().split(' ')
                if len(parts) >= 5:  # Assuming the line structure is consistent
                    file_id = parts[1].strip()  # Extract the file_id
                    label = parts[-1].strip()   # Extract the label (e.g., "spoof" or "bonafide")
                    # Assign 0 for spoof, 1 for real/bonafide
                    labels[file_id] = 0 if label == 'spoof' else 1
    else:
        # If working with segment-level labels
        labels = np.load(label_file, allow_pickle=True).item()

    return labels

def extract_handcrafted_features(segment, sr, target_size=60):
    windowed_segment = segment * hann_window(len(segment))
    mfcc = librosa.feature.mfcc(y=windowed_segment, sr=sr, n_mfcc=13).mean(axis=1)
    delta_mfcc = librosa.feature.delta(mfcc)
    tempo = librosa.beat.tempo(y=windowed_segment, sr=sr)[0]
    chroma = librosa.feature.chroma_stft(y=windowed_segment, sr=sr).mean(axis=1)
    zcr = librosa.feature.zero_crossing_rate(windowed_segment).mean()
    energy = librosa.feature.rms(y=windowed_segment).mean()
    pitches, _ = librosa.core.piptrack(y=windowed_segment, sr=sr)
    pitch = np.mean(pitches[pitches > 0]) if len(pitches[pitches > 0]) > 0 else 0
    tempogram = librosa.feature.tempogram(y=windowed_segment, sr=sr).mean(axis=1)
    downsampled_tempogram = tempogram[::int(np.ceil(len(tempogram) / 18))]
    
    features = np.concatenate((mfcc, delta_mfcc, [tempo], chroma, [zcr], [energy], [pitch], downsampled_tempogram))
    
    # Pad or truncate features to the target size
    if len(features) < target_size:
        features = np.pad(features, (0, target_size - len(features)), mode='constant')
    elif len(features) > target_size:
        features = features[:target_size]
        return features

# Wav2Vec2 feature extraction function
def extract_wav2vec_features(segment, sr, target_size=1024):
    if len(segment.shape) > 1:   # Ensure the segment is a 1D array and not a 2D array
        segment = segment.flatten()  # Flatten it to 1D if needed
    
    inputs = feature_extractor(segment, sampling_rate=sr, return_tensors="pt", padding=False)
    
    with torch.no_grad():
        outputs = model(**inputs)  # Extract the last hidden state and return it
    wav2vec_features = outputs.last_hidden_state.mean(dim=1).squeeze().numpy()
    
    # Pad or truncate to fixed size (1024)
    if len(wav2vec_features) < target_size:
        wav2vec_features = np.pad(wav2vec_features, (0, target_size - len(wav2vec_features)), mode='constant')
    elif len(wav2vec_features) > target_size:
        wav2vec_features = wav2vec_features[:target_size]
    
    return wav2vec_features

# Main processing function
def process_dataset(audio_files, labels, utterance_level):
    all_features = []
    all_labels = []
    for audio_file in tqdm(audio_files, desc="Processing files"):
        file_id = os.path.basename(audio_file).replace('.wav', '')
        if file_id not in labels:
            print('no file id exist')
            continue

        y, sr = librosa.load(audio_file, sr=16000)
        if utterance_level:
            label = labels[file_id]
            handcrafted_features = extract_handcrafted_features(y, sr)
            wav2vec_features = extract_wav2vec_features(y, sr)
            combined_features = np.concatenate((handcrafted_features, wav2vec_features))
            all_features.append(combined_features)
            all_labels.append(label)
        else:
            segment_labels = labels[file_id]
            segment_length = int(window_size * sr)
            hop_length = int(hop_size * sr)
            for i, seg_label in enumerate(segment_labels):
                start = i * hop_length
                end = start + segment_length
                if end > len(y):
                    break
                segment = y[start:end]
                handcrafted_features = extract_handcrafted_features(segment, sr)
                wav2vec_features = extract_wav2vec_features(segment, sr)
                combined_features = np.concatenate((handcrafted_features, wav2vec_features))
                all_features.append(combined_features)
                all_labels.append(seg_label)

    return np.array(all_features), np.array(all_labels)

# Load labels and process dataset for 100 spoof and 100 real audios
labels = load_labels(utterance_label_path if extract_utterance_level else segment_label_path, extract_utterance_level)

audio_files = glob(train_audio_path)
print(len(audio_files))

# Extract file IDs (for matching with labels)
audio_file_ids = [os.path.basename(file).replace('.wav', '') for file in audio_files]
# print(audio_file_ids[:10])  # Print first 10 file IDs

# Now, match the files based on file_id and their labels
spoof_files = [file for file in audio_files if os.path.basename(file).replace('.wav', '') in labels and labels[os.path.basename(file).replace('.wav', '')] == 0]
print(len(spoof_files))

real_files = [file for file in audio_files if os.path.basename(file).replace('.wav', '') in labels and labels[os.path.basename(file).replace('.wav', '')] == 1]
print(len(real_files))

# Select 100 spoof audios from the start and 100 real audios from the end
selected_spoof_files = spoof_files[:30]
selected_real_files = real_files[-30:]

# Print names and labels of selected audios
for audio_file in selected_spoof_files:
    file_id = os.path.basename(audio_file).replace('.wav', '')
    # print(f"Selected spoof audio: {file_id} - Label: {labels[file_id]}")

for audio_file in selected_real_files:
    file_id = os.path.basename(audio_file).replace('.wav', '')
    # print(f"Selected real audio: {file_id} - Label: {labels[file_id]}")


# Assuming you are only processing the selected spoof and real files for evaluation
# selected_audio_files = selected_spoof_files + selected_real_files

# Process the dataset
features, labels = process_dataset(selected_audio_files, labels, extract_utterance_level)

# After processing, ensure that features and labels are aligned correctly
# assert len(features) == len(selected_audio_files), "Mismatch between number of features and audio files"

# Split dataset into train, validation, and test sets (80-10-10 split)
X_train, X_temp, y_train, y_temp = train_test_split(features, labels, test_size=0.2, random_state=42)
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.2, random_state=42)


# Ensure that you only select predictions for the selected test audio files
selected_audio_file_ids = [os.path.basename(file).replace('.wav', '') for file in selected_audio_files]

# Make sure all features and labels match correctly
assert len(selected_audio_files) == len(y_pred), "Mismatch between number of files and predictions."

# Create the DataFrame with audio filenames, predictions, and scores
frame_level_predictions = pd.DataFrame({
    'Audio Filename': selected_audio_file_ids,  # Ensure these correspond to the files used in test
    'Prediction': y_pred,  # These are the predictions for the test set
    'Score': y_pred_prob  # Probability scores
})

# Save to CSV
frame_level_predictions.to_csv("frame_level_predictions.csv", index=False)

print("Evaluation complete and results saved.")

# Standardize the features (important for MLP performance)
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_val = scaler.transform(X_val)
X_test = scaler.transform(X_test)

# Train an MLP classifier
mlp = MLPClassifier(hidden_layer_sizes=(512, 256), max_iter=300, random_state=42)
mlp.fit(X_train, y_train)

# Evaluate the model
y_pred = mlp.predict(X_test)
y_pred_prob = mlp.predict_proba(X_test)[:, 1]  # Probabilities for ROC AUC and EER

# Confusion matrix
cm = confusion_matrix(y_test, y_pred)

# ROC AUC
roc_auc = roc_auc_score(y_test, y_pred_prob)

# EER calculation
precision, recall, thresholds = precision_recall_curve(y_test, y_pred_prob)
fpr, tpr, thresholds_roc = roc_curve(y_test, y_pred_prob)
eer = thresholds[np.nanargmin(np.abs(fpr - (1 - tpr)))]

# Save evaluation results to a .txt file
with open("evaluation_results.txt", "w") as f:
    f.write(f"Confusion Matrix:\n{cm}\n\n")
    f.write(f"ROC AUC: {roc_auc}\n")
    f.write(f"EER: {eer}\n")

# Save frame-level predictions to CSV
frame_level_predictions = pd.DataFrame({
    'Audio Filename': selected_audio_files,  # Ensure these correspond to the files used in test
    'Prediction': y_pred,  # These are the predictions for the test set
    'Score': y_pred_prob  # Probability scores
})

# Save to CSV
frame_level_predictions.to_csv("frame_level_predictions.csv", index=False)

print("Evaluation complete and results saved.")

Some weights of Wav2Vec2Model were not initialized from the model checkpoint at facebook/wav2vec2-large-xlsr-53 and are newly initialized: ['wav2vec2.encoder.pos_conv_embed.conv.parametrizations.weight.original0', 'wav2vec2.encoder.pos_conv_embed.conv.parametrizations.weight.original1']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


25380
22800
2580


	This function was moved to 'librosa.feature.rhythm.tempo' in librosa version 0.10.0.
	This alias will be removed in librosa version 1.0.
  tempo = librosa.beat.tempo(y=windowed_segment, sr=sr)[0]
Processing files: 100%|████████████████████████████████████████████████████████████████| 60/60 [00:18<00:00,  3.23it/s]


AssertionError: Mismatch between number of files and predictions.

In [57]:
print(f"Number of selected audio files: {len(selected_audio_files)}")
print(f"Number of features extracted: {len(features)}")


Number of selected audio files: 60
Number of features extracted: 60


In [58]:
print(f"Shape of X_test: {X_test.shape}")
print(f"Shape of y_pred: {y_pred.shape}")


Shape of X_test: (3, 1084)
Shape of y_pred: (3,)


In [5]:
pwd

'C:\\Notebooks'

In [9]:
import os
import numpy as np
import pandas as pd
from glob import glob
import librosa
import torch
from tqdm import tqdm
from transformers import Wav2Vec2FeatureExtractor, Wav2Vec2Model
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, roc_auc_score, precision_recall_curve, auc, roc_curve
from sklearn.neural_network import MLPClassifier
from sklearn.preprocessing import StandardScaler

# Initialize paths and parameters
model_name = "facebook/wav2vec2-large-xlsr-53"
feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained(model_name)
model = Wav2Vec2Model.from_pretrained(model_name)

train_audio_path = r"F:\\Awais_data\\Datasets\\PartialSpoof\\train\\con_wav\\*.wav"
segment_label_path = r"F:\Awais_data\Datasets\PartialSpoof\database_segment_labels\database\segment_labels\train_seglab_0.16.npy"
utterance_label_path = r"F:\Awais_data\Datasets\PartialSpoof\protocols\PartialSpoof_LA_cm_protocols\PartialSpoof.LA.cm.train.trl.txt"
save_path = r"C:\\Notebooks\\rrl_source\\dataset_raw\\merge\\new\\"

window_size = 0.16 # Segment length in seconds
hop_size = 0.16    # Frame shift in seconds
extract_utterance_level = True  # Set to True for utterance-level, False for segment-level

# Hann window function
def hann_window(length):
    return 0.5 * (1 - np.cos(2 * np.pi * np.arange(length) / (length - 1)))

# Load segment-level or utterance-level labels
def load_labels(label_file, utterance_level):
    labels = {}
    
    if utterance_level:
        # For utterance-level labels, we parse the label file and map the class labels to file_ids
        with open(label_file, 'r') as file:
            for line in file:
                parts = line.strip().split(' ')
                if len(parts) >= 5:  # Assuming the line structure is consistent
                    file_id = parts[1].strip()  # Extract the file_id
                    label = parts[-1].strip()   # Extract the label (e.g., "spoof" or "bonafide")
                    # Assign 0 for spoof, 1 for real/bonafide
                    labels[file_id] = 0 if label == 'spoof' else 1
    else:
        # If working with segment-level labels
        labels = np.load(label_file, allow_pickle=True).item()

    return labels

def extract_handcrafted_features(segment, sr, target_size=60):
    windowed_segment = segment * hann_window(len(segment))
    mfcc = librosa.feature.mfcc(y=windowed_segment, sr=sr, n_mfcc=13).mean(axis=1)
    delta_mfcc = librosa.feature.delta(mfcc)
    # tempo = librosa.feature.rhythm.tempo(y=windowed_segment, sr=sr)[0]
    tempo = librosa.beat.tempo(y=windowed_segment, sr=sr)[0]
    chroma = librosa.feature.chroma_stft(y=windowed_segment, sr=sr).mean(axis=1)
    zcr = librosa.feature.zero_crossing_rate(windowed_segment).mean()
    energy = librosa.feature.rms(y=windowed_segment).mean()
    pitches, _ = librosa.core.piptrack(y=windowed_segment, sr=sr)
    pitch = np.mean(pitches[pitches > 0]) if len(pitches[pitches > 0]) > 0 else 0
    tempogram = librosa.feature.tempogram(y=windowed_segment, sr=sr).mean(axis=1)
    downsampled_tempogram = tempogram[::int(np.ceil(len(tempogram) / 18))]
    
    features = np.concatenate((mfcc, delta_mfcc, [tempo], chroma, [zcr], [energy], [pitch], downsampled_tempogram))
    
    # Pad or truncate features to the target size
    if len(features) < target_size:
        features = np.pad(features, (0, target_size - len(features)), mode='constant')
    elif len(features) > target_size:
        features = features[:target_size]
    
    return features

# Wav2Vec2 feature extraction function
def extract_wav2vec_features(segment, sr, target_size=1024):
    if len(segment.shape) > 1:   # Ensure the segment is a 1D array and not a 2D array
        segment = segment.flatten()  # Flatten it to 1D if needed
    
    inputs = feature_extractor(segment, sampling_rate=sr, return_tensors="pt", padding=False)
    
    with torch.no_grad():
        outputs = model(**inputs)  # Extract the last hidden state and return it
    wav2vec_features = outputs.last_hidden_state.mean(dim=1).squeeze().numpy()
    
    # Pad or truncate to fixed size (1024)
    if len(wav2vec_features) < target_size:
        wav2vec_features = np.pad(wav2vec_features, (0, target_size - len(wav2vec_features)), mode='constant')
    elif len(wav2vec_features) > target_size:
        wav2vec_features = wav2vec_features[:target_size]
    
    return wav2vec_features

# Main processing function
def process_dataset(audio_files, labels, utterance_level):
    all_features = []
    all_labels = []
    for audio_file in tqdm(audio_files, desc="Processing files"):
        file_id = os.path.basename(audio_file).replace('.wav', '')
        if file_id not in labels:
            print('no file id exist')
            continue

        y, sr = librosa.load(audio_file, sr=16000)
        y = librosa.util.fix_length(y, size=int(4 * sr))  # Truncate or pad to 4 seconds
  # Truncate or pad to 4 seconds
        num_frames = int(4 / hop_size)
        
        if utterance_level:
            label = labels[file_id]
            for _ in range(num_frames):
                handcrafted_features = extract_handcrafted_features(y, sr)
                wav2vec_features = extract_wav2vec_features(y, sr)
                combined_features = np.concatenate((handcrafted_features, wav2vec_features))
                all_features.append(combined_features)
                all_labels.append(label)
        else:
            segment_labels = labels[file_id]
            segment_length = int(window_size * sr)
            hop_length = int(hop_size * sr)
            for i in range(num_frames):
                start = i * hop_length
                end = start + segment_length
                segment = y[start:end]
                handcrafted_features = extract_handcrafted_features(segment, sr)
                wav2vec_features = extract_wav2vec_features(segment, sr)
                combined_features = np.concatenate((handcrafted_features, wav2vec_features))
                all_features.append(combined_features)
                all_labels.append(segment_labels[i])

    return np.array(all_features), np.array(all_labels)

# Load labels and process dataset for 100 spoof and 100 real audios
labels = load_labels(utterance_label_path if extract_utterance_level else segment_label_path, extract_utterance_level)

audio_files = glob(train_audio_path)
print(len(audio_files))

# Extract file IDs (for matching with labels)
audio_file_ids = [os.path.basename(file).replace('.wav', '') for file in audio_files]
# print(audio_file_ids[:10])  # Print first 10 file IDs

# Now, match the files based on file_id and their labels
spoof_files = [file for file in audio_files if os.path.basename(file).replace('.wav', '') in labels and labels[os.path.basename(file).replace('.wav', '')] == 0]
print(len(spoof_files))

real_files = [file for file in audio_files if os.path.basename(file).replace('.wav', '') in labels and labels[os.path.basename(file).replace('.wav', '')] == 1]
print(len(real_files))

# Select 100 spoof audios from the start and 100 real audios from the end
selected_spoof_files = spoof_files[:50]  # Selecting the first 100 spoof files
selected_real_files = real_files[-25:]  # Selecting the last 100 real files


# selected_spoof_files = spoof_files  # Use all available spoof files
# selected_real_files = real_files  # Use all available real files


# Print names and labels of selected audios
for audio_file in selected_spoof_files:
    file_id = os.path.basename(audio_file).replace('.wav', '')
    # print(f"Selected spoof audio: {file_id} - Label: {labels[file_id]}")

for audio_file in selected_real_files:
    file_id = os.path.basename(audio_file).replace('.wav', '')
    # print(f"Selected real audio: {file_id} - Label: {labels[file_id]}")

# Combine the selected files
selected_audio_files = selected_spoof_files + selected_real_files

# Process the dataset
features, labels = process_dataset(selected_audio_files, labels, extract_utterance_level)

# Split dataset into train, validation, and test sets (80-10-10 split)
X_train, X_temp, y_train, y_temp = train_test_split(features, labels, test_size=0.2, random_state=42)
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.2, random_state=42)

# Standardize the features (important for MLP performance)
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_val = scaler.transform(X_val)
X_test = scaler.transform(X_test)

# Train an MLP classifier
mlp = MLPClassifier(hidden_layer_sizes=(512, 256), max_iter=300, random_state=42)
mlp.fit(X_train, y_train)

# Evaluate the model
y_pred = mlp.predict(X_test)
y_pred_prob = mlp.predict_proba(X_test)[:, 1]  # Probabilities for ROC AUC and EER

# Confusion matrix
cm = confusion_matrix(y_test, y_pred)

# ROC AUC
roc_auc = roc_auc_score(y_test, y_pred_prob)

# EER calculation
precision, recall, thresholds = precision_recall_curve(y_test, y_pred_prob)
fpr, tpr, _ = roc_curve(y_test, y_pred_prob)
eer = fpr[np.nanargmin(np.abs(fpr - (1 - tpr)))]  # EER is where fpr = 1 - tpr

print(f"Confusion Matrix:\n{cm}")
print(f"ROC AUC: {roc_auc:.4f}")
print(f"EER: {eer:.4f}")

# Save evaluation results to a .txt file
evaluation_file = "C:\\Notebooks\\rrl_source\\dataset_raw\\merge\\new\\evaluation_results.txt"
with open(evaluation_file, "w") as file:
    file.write(f"Confusion Matrix:\n{cm}\n")
    file.write(f"ROC AUC: {roc_auc:.4f}\n")
    file.write(f"EER: {eer:.4f}\n")

# Save frame-level predictions to a .csv file
predictions_df = pd.DataFrame({
    'File ID': selected_audio_files,
    'Prediction': y_pred,
    'Prediction Score': y_pred_prob
})
predictions_file = "C:\\Notebooks\\rrl_source\\dataset_raw\\merge\\new\\frame_level_predictions.csv"
predictions_df.to_csv(predictions_file, index=False)

print(f"Evaluation results and predictions saved to {evaluation_file} and {predictions_file}.")



Some weights of Wav2Vec2Model were not initialized from the model checkpoint at facebook/wav2vec2-large-xlsr-53 and are newly initialized: ['wav2vec2.encoder.pos_conv_embed.conv.parametrizations.weight.original0', 'wav2vec2.encoder.pos_conv_embed.conv.parametrizations.weight.original1']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


25380
22800
2580


	This function was moved to 'librosa.feature.rhythm.tempo' in librosa version 0.10.0.
	This alias will be removed in librosa version 1.0.
  tempo = librosa.beat.tempo(y=windowed_segment, sr=sr)[0]
Processing files: 100%|████████████████████████████████████████████████████████████████| 75/75 [14:49<00:00, 11.87s/it]


Confusion Matrix:
[[48  0]
 [ 0 27]]
ROC AUC: 1.0000
EER: 0.0000
Evaluation results and predictions saved to C:\Notebooks\rrl_source\dataset_raw\merge\new\evaluation_results.txt and C:\Notebooks\rrl_source\dataset_raw\merge\new\frame_level_predictions.csv.


In [8]:
import os
import numpy as np
import pandas as pd
from glob import glob
import librosa
import torch
from tqdm import tqdm
from transformers import Wav2Vec2FeatureExtractor, Wav2Vec2Model
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, roc_auc_score, precision_recall_curve, auc, roc_curve
from sklearn.neural_network import MLPClassifier
from sklearn.preprocessing import StandardScaler

# Initialize paths and parameters
model_name = "facebook/wav2vec2-large-xlsr-53"
feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained(model_name)
model = Wav2Vec2Model.from_pretrained(model_name)

train_audio_path = r"F:\\Awais_data\\Datasets\\PartialSpoof\\train\\con_wav\\*.wav"
segment_label_path = r"F:\Awais_data\Datasets\PartialSpoof\database_segment_labels\database\segment_labels\train_seglab_0.16.npy"
utterance_label_path = r"F:\Awais_data\Datasets\PartialSpoof\protocols\PartialSpoof_LA_cm_protocols\PartialSpoof.LA.cm.train.trl.txt"
save_path = r"C:\\Notebooks\\rrl_source\\dataset_raw\\merge\\new\\"

window_size = 0.16 # Segment length in seconds
hop_size = 0.16    # Frame shift in seconds
extract_utterance_level = False  # Set to True for utterance-level, False for segment-level

# Hann window function
def hann_window(length):
    return 0.5 * (1 - np.cos(2 * np.pi * np.arange(length) / (length - 1)))

# Load segment-level or utterance-level labels
def load_labels(label_file, utterance_level):
    labels = {}
    
    if utterance_level:
        # For utterance-level labels, we parse the label file and map the class labels to file_ids
        with open(label_file, 'r') as file:
            for line in file:
                parts = line.strip().split(' ')
                if len(parts) >= 5:  # Assuming the line structure is consistent
                    file_id = parts[1].strip()  # Extract the file_id
                    label = parts[-1].strip()   # Extract the label (e.g., "spoof" or "bonafide")
                    # Assign 0 for spoof, 1 for real/bonafide
                    labels[file_id] = 0 if label == 'spoof' else 1
    else:
        # If working with segment-level labels
        labels = np.load(label_file, allow_pickle=True).item()

    return labels

def extract_handcrafted_features(segment, sr, target_size=60):
    windowed_segment = segment * hann_window(len(segment))
    mfcc = librosa.feature.mfcc(y=windowed_segment, sr=sr, n_mfcc=13).mean(axis=1)
    delta_mfcc = librosa.feature.delta(mfcc)
    tempo = librosa.beat.tempo(y=windowed_segment, sr=sr)[0]
    chroma = librosa.feature.chroma_stft(y=windowed_segment, sr=sr).mean(axis=1)
    zcr = librosa.feature.zero_crossing_rate(windowed_segment).mean()
    energy = librosa.feature.rms(y=windowed_segment).mean()
    pitches, _ = librosa.core.piptrack(y=windowed_segment, sr=sr)
    pitch = np.mean(pitches[pitches > 0]) if len(pitches[pitches > 0]) > 0 else 0
    tempogram = librosa.feature.tempogram(y=windowed_segment, sr=sr).mean(axis=1)
    downsampled_tempogram = tempogram[::int(np.ceil(len(tempogram) / 18))]
    
    features = np.concatenate((mfcc, delta_mfcc, [tempo], chroma, [zcr], [energy], [pitch], downsampled_tempogram))
    
    # Pad or truncate features to the target size
    if len(features) < target_size:
        features = np.pad(features, (0, target_size - len(features)), mode='constant')
    elif len(features) > target_size:
        features = features[:target_size]
    
    return features

# Wav2Vec2 feature extraction function
def extract_wav2vec_features(segment, sr, target_size=1024):
    if len(segment.shape) > 1:   # Ensure the segment is a 1D array and not a 2D array
        segment = segment.flatten()  # Flatten it to 1D if needed
    
    inputs = feature_extractor(segment, sampling_rate=sr, return_tensors="pt", padding=False)
    
    with torch.no_grad():
        outputs = model(**inputs)  # Extract the last hidden state and return it
    wav2vec_features = outputs.last_hidden_state.mean(dim=1).squeeze().numpy()
    
    # Pad or truncate to fixed size (1024)
    if len(wav2vec_features) < target_size:
        wav2vec_features = np.pad(wav2vec_features, (0, target_size - len(wav2vec_features)), mode='constant')
    elif len(wav2vec_features) > target_size:
        wav2vec_features = wav2vec_features[:target_size]
    
    return wav2vec_features

# Main processing function
def process_dataset(audio_files, labels, utterance_level):
    all_features = []
    all_labels = []
    window_size_samples = int(window_size * 16000)  # Convert segment length to samples
    hop_length_samples = int(hop_size * 16000)  # Convert hop size to samples

    for audio_file in tqdm(audio_files, desc="Processing files"):
        file_id = os.path.basename(audio_file).replace('.wav', '')
        if file_id not in labels:
            print(f"No label found for file ID {file_id}. Skipping.")
            continue

        y, sr = librosa.load(audio_file, sr=16000)
        y = librosa.util.fix_length(y, size=int(4 * sr))  # Truncate or pad to 4 seconds (can be adjusted)

        num_frames = int(4 / hop_size)  # Total number of frames for 4-second audio

        if utterance_level:
            # For utterance-level, apply the same label to all frames
            label = labels[file_id]
            for _ in range(num_frames):
                handcrafted_features = extract_handcrafted_features(y, sr)
                wav2vec_features = extract_wav2vec_features(y, sr)
                combined_features = np.concatenate((handcrafted_features, wav2vec_features))
                all_features.append(combined_features)
                all_labels.append(label)
        else:
            # For segment-level processing
            segment_labels = labels[file_id]
            segment_length = int(window_size * sr)
            hop_length = int(hop_size * sr)

            # Pad or repeat segment labels to match the required number of frames
            if len(segment_labels) < num_frames:
                segment_labels = list(segment_labels) + list(segment_labels[:num_frames - len(segment_labels)])
            elif len(segment_labels) > num_frames:
                segment_labels = segment_labels[:num_frames]

            for i in range(num_frames):
                start = i * hop_length
                end = start + segment_length
                segment = y[start:end]

                if len(segment) < segment_length:
                    # Pad segment if it's shorter than expected
                    segment = np.pad(segment, (0, segment_length - len(segment)), mode='constant')

                handcrafted_features = extract_handcrafted_features(segment, sr)
                wav2vec_features = extract_wav2vec_features(segment, sr)
                combined_features = np.concatenate((handcrafted_features, wav2vec_features))
                all_features.append(combined_features)
                all_labels.append(segment_labels[i])

                # Move print statement inside the loop where `i` is defined
                print(f"Indexing at: {i}")

    return np.array(all_features), np.array(all_labels)


# Load labels and process dataset for 100 spoof and 100 real audios
labels = load_labels(utterance_label_path if extract_utterance_level else segment_label_path, extract_utterance_level)

audio_files = glob(train_audio_path)
print(len(audio_files))

# Extract file IDs (for matching with labels)
audio_file_ids = [os.path.basename(file).replace('.wav', '') for file in audio_files]
print(audio_file_ids[:10])  # Print first 10 file IDs

spoof_files = []
real_files = []
for file in audio_files:
    file_id = os.path.basename(file).replace('.wav', '')
    
    if file_id not in labels:
        print(f"File ID {file_id} not found in labels.")
        continue
    
    if extract_utterance_level:
        # For utterance-level labels
        label = labels[file_id]
        if label == 0:
            spoof_files.append(file)
        elif label == 1:
            real_files.append(file)
    else:
        # For segment-level labels
        segment_labels = labels[file_id]
        # Here, check for segment-level processing logic
        if int(segment_labels[0]) == 0:
            spoof_files.append(file)
        elif int(segment_labels[0]) == 1:
            real_files.append(file)

print(f"Number of spoof files: {len(spoof_files)}")
print(f"Number of real files: {len(real_files)}")
# selected_spoof_files = spoof_files  # Use all available spoof files
# selected_real_files = real_files  # Use all available real files

# Select 100 spoof audios from the start and 100 real audios from the end
selected_spoof_files = spoof_files[:10]  # Selecting the first 100 spoof files
selected_real_files = real_files[-10:]  # Selecting the last 100 real files

# Print names and labels of selected audios
for audio_file in selected_spoof_files:
    file_id = os.path.basename(audio_file).replace('.wav', '')
    # print(f"Selected spoof audio: {file_id} - Label: {labels[file_id]}")

for audio_file in selected_real_files:
    file_id = os.path.basename(audio_file).replace('.wav', '')
    # print(f"Selected real audio: {file_id} - Label: {labels[file_id]}")

# Combine the selected files
selected_audio_files = selected_spoof_files + selected_real_files

# Process the dataset
features, labels = process_dataset(selected_audio_files, labels, extract_utterance_level)

# Split dataset into train, validation, and test sets (80-10-10 split)
X_train, X_temp, y_train, y_temp = train_test_split(features, labels, test_size=0.2, random_state=42)
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.2, random_state=42)

# Standardize the features (important for MLP performance)
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_val = scaler.transform(X_val)
X_test = scaler.transform(X_test)

# Train an MLP classifier
mlp = MLPClassifier(hidden_layer_sizes=(512, 256), max_iter=300, random_state=42)
mlp.fit(X_train, y_train)

# Evaluate the model
y_pred = mlp.predict(X_test)
y_pred_prob = mlp.predict_proba(X_test)[:, 1]  # Probabilities for ROC AUC and EER

# Confusion matrix
cm = confusion_matrix(y_test, y_pred)

# ROC AUC
roc_auc = roc_auc_score(y_test, y_pred_prob)

# EER calculation
precision, recall, thresholds = precision_recall_curve(y_test, y_pred_prob)
fpr, tpr, _ = roc_curve(y_test, y_pred_prob)
eer = fpr[np.nanargmin(np.abs(fpr - (1 - tpr)))]  # EER is where fpr = 1 - tpr

print(f"Confusion Matrix:\n{cm}")
print(f"ROC AUC: {roc_auc:.4f}")
print(f"EER: {eer:.4f}")

# Save evaluation results to a .txt file
evaluation_file = "C:\\Notebooks\\rrl_source\\dataset_raw\\merge\\new\\evaluation_results.txt"
with open(evaluation_file, "w") as file:
    file.write(f"Confusion Matrix:\n{cm}\n")
    file.write(f"ROC AUC: {roc_auc:.4f}\n")
    file.write(f"EER: {eer:.4f}\n")

# Save frame-level predictions to a .csv file
predictions_df = pd.DataFrame({
    'File ID': selected_audio_files,
    'Prediction': y_pred,
    'Prediction Score': y_pred_prob
})
predictions_file = "C:\\Notebooks\\rrl_source\\dataset_raw\\merge\\new\\frame_level_predictions.csv"
predictions_df.to_csv(predictions_file, index=False)

print(f"Evaluation results and predictions saved to {evaluation_file} and {predictions_file}.")

Some weights of Wav2Vec2Model were not initialized from the model checkpoint at facebook/wav2vec2-large-xlsr-53 and are newly initialized: ['wav2vec2.encoder.pos_conv_embed.conv.parametrizations.weight.original0', 'wav2vec2.encoder.pos_conv_embed.conv.parametrizations.weight.original1']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


25380
['CON_T_0000000', 'CON_T_0000001', 'CON_T_0000002', 'CON_T_0000003', 'CON_T_0000004', 'CON_T_0000005', 'CON_T_0000006', 'CON_T_0000007', 'CON_T_0000008', 'CON_T_0000009']
Number of spoof files: 20518
Number of real files: 4862


	This function was moved to 'librosa.feature.rhythm.tempo' in librosa version 0.10.0.
	This alias will be removed in librosa version 1.0.
  tempo = librosa.beat.tempo(y=windowed_segment, sr=sr)[0]


Indexing at: 0
Indexing at: 1
Indexing at: 2
Indexing at: 3
Indexing at: 4
Indexing at: 5
Indexing at: 6
Indexing at: 7
Indexing at: 8
Indexing at: 9
Indexing at: 10
Indexing at: 11
Indexing at: 12
Indexing at: 13
Indexing at: 14
Indexing at: 15
Indexing at: 16
Indexing at: 17
Indexing at: 18
Indexing at: 19
Indexing at: 20
Indexing at: 21
Indexing at: 22


Processing files:   5%|███▎                                                             | 1/20 [00:02<00:53,  2.84s/it]

Indexing at: 23
Indexing at: 24
Indexing at: 0
Indexing at: 1
Indexing at: 2
Indexing at: 3
Indexing at: 4
Indexing at: 5
Indexing at: 6
Indexing at: 7
Indexing at: 8
Indexing at: 9
Indexing at: 10
Indexing at: 11
Indexing at: 12
Indexing at: 13
Indexing at: 14
Indexing at: 15
Indexing at: 16
Indexing at: 17
Indexing at: 18
Indexing at: 19
Indexing at: 20
Indexing at: 21
Indexing at: 22


Processing files:  10%|██████▌                                                          | 2/20 [00:05<00:49,  2.77s/it]

Indexing at: 23
Indexing at: 24
Indexing at: 0
Indexing at: 1
Indexing at: 2
Indexing at: 3
Indexing at: 4
Indexing at: 5
Indexing at: 6
Indexing at: 7
Indexing at: 8
Indexing at: 9
Indexing at: 10
Indexing at: 11
Indexing at: 12
Indexing at: 13
Indexing at: 14
Indexing at: 15
Indexing at: 16
Indexing at: 17
Indexing at: 18
Indexing at: 19
Indexing at: 20
Indexing at: 21
Indexing at: 22


Processing files:  10%|██████▌                                                          | 2/20 [00:08<01:16,  4.24s/it]

Indexing at: 23





IndexError: list index out of range