In [1]:
# Import necessary libraries
import os
import numpy as np
import librosa
import torch
from transformers import Wav2Vec2Processor, Wav2Vec2Model

  from .autonotebook import tqdm as notebook_tqdm


In [9]:
# Input folders: one per dataset split, all sharing the same path prefix.
folder_train_file, folder_dev_file, folder_eval_file = (
    '../Dataset/ASVSpoof/actual/ASVspoof2017_V2_' + split
    for split in ('train', 'dev', 'eval')
)

# Destination folder for the extracted feature files.
output_folder = './Voice_Liveness/Audio_Features/'

In [3]:
# Load the Wav2Vec2 processor first, then the model, from the same pretrained
# checkpoint so the preprocessing matches the network.
processor, wav2vec_model = (
    loader.from_pretrained("facebook/wav2vec2-base-960h")
    for loader in (Wav2Vec2Processor, Wav2Vec2Model)
)

Some weights of Wav2Vec2Model were not initialized from the model checkpoint at facebook/wav2vec2-base-960h and are newly initialized: ['wav2vec2.masked_spec_embed']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [4]:
# Define audio processing functions
def normalize_audio(audio):
    """Peak-normalize a waveform so its largest absolute sample is 1.

    Args:
        audio: Array-like of audio samples.

    Returns:
        A NumPy array with the samples divided by the peak absolute value.
        Empty input and all-zero (silent) input are returned unscaled, because
        there is no peak to divide by; dividing would yield NaNs that then
        propagate into every downstream feature.
    """
    audio = np.asarray(audio)
    if audio.size == 0:
        # np.max raises on an empty array; nothing to normalize anyway.
        return audio

    peak = np.max(np.abs(audio))
    if peak == 0:
        # Silent clip: avoid 0/0 -> NaN. Return a copy so the caller never
        # receives an alias of its own input, matching the normal path.
        return audio.copy()

    return audio / peak

def pad_and_truncate_audio(audio, target_duration, sample_rate):
    """Force a waveform to exactly ``int(target_duration * sample_rate)`` samples.

    Args:
        audio: 1-D sequence of samples.
        target_duration: Desired length, multiplied by ``sample_rate`` to get
            the sample count.
        sample_rate: Samples per unit of ``target_duration``.

    Returns:
        The input right-padded with zeros when it is too short, otherwise the
        leading ``target`` samples of the input.
    """
    wanted = int(target_duration * sample_rate)
    shortfall = wanted - len(audio)

    # Long enough (or exactly right): keep only the leading samples.
    if shortfall <= 0:
        return audio[:wanted]

    # Too short: append zeros at the end to reach the wanted length.
    return np.pad(audio, (0, shortfall), mode='constant')

# Calculate median duration for padding/truncating
def get_median_duration(folder):
    """Return the median duration of the audio files directly inside ``folder``.

    Each regular file is loaded with ``librosa.load`` at its native sample rate
    (``sr=None``) and measured with ``librosa.get_duration``.

    Args:
        folder: Path to a directory of audio files.

    Returns:
        The median of the per-file durations as returned by ``np.median``.

    Raises:
        ValueError: If ``folder`` contains no regular files. Without this check
            ``np.median([])`` yields NaN, which only fails much later when the
            value is converted to a sample count.
    """
    durations = []
    # Sorted for a deterministic traversal; os.listdir order is arbitrary.
    for entry in sorted(os.listdir(folder)):
        file_path = os.path.join(folder, entry)
        if not os.path.isfile(file_path):
            # Sub-directories cannot be decoded as audio; skip them.
            continue
        audio, sr = librosa.load(file_path, sr=None)
        durations.append(librosa.get_duration(y=audio, sr=sr))

    if not durations:
        raise ValueError(
            f"No audio files found in {folder!r}; cannot compute a median duration."
        )
    return np.median(durations)

In [None]:
# Target clip length derived from the training split only; this one value is
# later handed to extract_features so clips are padded/truncated to a common
# length. Note: this loads every training file, so the cell can be slow.
median_duration = get_median_duration(folder_train_file)

In [11]:
def extract_features(folder, output_folder, processor, model, median_duration, name):
    """Extract one mean-pooled embedding per audio file and save them as one array.

    For every regular file in ``folder`` (visited in sorted file-name order) the
    audio is loaded at 16 kHz, peak-normalized, padded/truncated to
    ``median_duration``, run through ``processor`` and ``model``, and the model's
    ``last_hidden_state`` is averaged over ``dim=1``.

    Args:
        folder: Directory containing the audio files.
        output_folder: Directory for the output file; created if missing.
        processor: Callable preprocessor; its result must expose ``input_values``.
        model: Callable model; its result must expose ``last_hidden_state``.
        median_duration: Target clip duration passed to ``pad_and_truncate_audio``.
        name: File name of the saved array inside ``output_folder``.

    Returns:
        None. The features are written with ``np.save`` and the path is printed.
        Row ``i`` of the saved array corresponds to the ``i``-th file name in
        sorted order, so rows can be matched back to files and labels.
    """
    # exist_ok avoids the check-then-create race of a separate exists() test.
    os.makedirs(output_folder, exist_ok=True)

    sample_rate = 16000

    features = []
    # os.listdir returns entries in arbitrary order; sorting makes the row order
    # of the saved array reproducible and recoverable from the file names.
    for entry in sorted(os.listdir(folder)):
        file_path = os.path.join(folder, entry)
        if not os.path.isfile(file_path):
            # Sub-directories cannot be decoded as audio; skip them.
            continue

        # Resampled to sample_rate on load; the returned rate is not needed.
        audio, _ = librosa.load(file_path, sr=sample_rate)

        # Normalize, then pad & truncate to the common length
        audio = normalize_audio(audio)
        audio = pad_and_truncate_audio(audio, median_duration, sample_rate)

        # Preprocess audio with the Wav2Vec2 processor
        inputs = processor(audio, sampling_rate=sample_rate, return_tensors="pt", padding=True)

        # Inference only: no gradients are needed for feature extraction.
        with torch.no_grad():
            hidden_states = model(inputs.input_values).last_hidden_state
            # Average over dim=1 to get a fixed-size vector per clip.
            pooled_output = hidden_states.mean(dim=1)
            features.append(pooled_output.squeeze().numpy())

    # Save features to disk
    feature_file = os.path.join(output_folder, name)
    np.save(feature_file, features)
    print(f"Features saved to {feature_file}")

In [None]:
# Run feature extraction
# extract_features(folder_train_file, output_folder, processor, wav2vec_model, median_duration, 'train_features.npy')

Features saved to ./Voice_Liveness/Audio_Features/train_features.npy


In [12]:
# Extract dev features first, then eval, using the same processor, model and
# target duration for both splits.
for split_folder, feature_name in (
    (folder_dev_file, 'dev_features.npy'),
    (folder_eval_file, 'eval_features.npy'),
):
    extract_features(
        split_folder,
        output_folder,
        processor,
        wav2vec_model,
        median_duration,
        feature_name,
    )

Features saved to ./Voice_Liveness/Audio_Features/dev_features.npy
Features saved to ./Voice_Liveness/Audio_Features/eval_features.npy
