In [4]:
import os
import numpy as np
import pandas as pd
import librosa
from glob import glob
from tqdm import tqdm
import torch
from transformers import Wav2Vec2FeatureExtractor, Wav2Vec2Model

# Paths and directories for train, dev, and eval
paths = {
    "train": {
        "label_file": r"D:\Datasets\PartialSpoof\database_segment_labels\database\segment_labels\train_seglab_0.16.npy",
        "audio_directory": r"D:\Datasets\PartialSpoof\Train\con_wav",
        "save_path": r"D:\Datasets\PartialSpoof\Segmented_features\train"
    },
    "dev": {
        "label_file": r"D:\Datasets\PartialSpoof\database_segment_labels\database\segment_labels\dev_seglab_0.16.npy",
        "audio_directory": r"D:\Datasets\PartialSpoof\dev\con_wav",
        "save_path": r"D:\Datasets\PartialSpoof\Segmented_features\dev"
    },
    "eval": {
        "label_file": r"D:\Datasets\PartialSpoof\database_segment_labels\database\segment_labels\eval_seglab_0.16.npy",
        "audio_directory": r"D:\Datasets\PartialSpoof\eval\con_wav",
        "save_path": r"D:\Datasets\PartialSpoof\Segmented_features\eval"
    }
}


# Check if GPU is available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Print statement to confirm whether the model is running on GPU or CPU
if torch.cuda.is_available():
    print("Running on GPU:", torch.cuda.get_device_name(0))  # Print the GPU name
else:
    print("Running on CPU")


# Initialize model for wav2vec-xls-r feature extraction
model_name = "facebook/wav2vec2-large-xlsr-53"
feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained(model_name)
model = Wav2Vec2Model.from_pretrained(model_name)

window_size = 0.16  # Segment length in seconds
hop_size = 0.15    # Frame shift in seconds

# Function to extract features from audio
def extract_features_for_segment(audio_directory, label_file, save_path, subset_name):
    # Load segment labels (NumPy)
    segment_labels = np.load(label_file, allow_pickle=True).item()

    all_features = []
    audio_files = glob(audio_directory + "/*.wav") [:]  # Process 10 samples

    for audio_file in tqdm(audio_files, desc=f"Processing {subset_name}"):
        file_id = os.path.basename(audio_file).replace('.wav', '')
        
        # Check if file exists in labels
        if file_id not in segment_labels:
            continue

        y, sr = librosa.load(audio_file, sr=16000)
        segment_labels_for_file = segment_labels[file_id]

        # Segment processing: calculate window and hop length based on audio length
        segment_length = int(window_size * sr)
        hop_length = int(hop_size * sr)
        
        # Loop through the segment labels and process each segment
        for i, seg_label in enumerate(segment_labels_for_file):
            start = i * hop_length
            end = start + segment_length
            if end > len(y):
                # Repeat padding for the last segment if it's smaller than the segment size
                segment = np.pad(y[start:], (0, segment_length - len(y[start:])), mode='edge')
            else:
                segment = y[start:end]

            # Feature extraction (handcrafted and wav2vec)
            handcrafted_features = extract_handcrafted_features(segment, sr)
            wav2vec_features = extract_wav2vec_features(segment, sr)
            combined_features = np.concatenate(([file_id], handcrafted_features, wav2vec_features, [seg_label]))
            all_features.append(combined_features)
    
    # Convert to DataFrame and save in parts
    df = pd.DataFrame(all_features)
    part_size = 100000  # Save 5 features per part, adjust as needed
    for i in range(0, len(df), part_size):
        part_features = df.iloc[i:i + part_size]
        part_name = f"{save_path}/{subset_name}_segment_features_part_{i // part_size + 1}.csv"
        part_features.to_csv(part_name, index=False)

# Handcrafted feature extraction
def extract_handcrafted_features(segment, sr):
    windowed_segment = segment * np.hanning(len(segment))
    mfcc = librosa.feature.mfcc(y=windowed_segment, sr=sr, n_mfcc=13).mean(axis=1)
    delta_mfcc = librosa.feature.delta(mfcc)
    tempo = librosa.beat.tempo(y=windowed_segment, sr=sr)[0]
    chroma = librosa.feature.chroma_stft(y=windowed_segment, sr=sr).mean(axis=1)
    zcr = librosa.feature.zero_crossing_rate(windowed_segment).mean()
    energy = librosa.feature.rms(y=windowed_segment).mean()
    pitches, _ = librosa.core.piptrack(y=windowed_segment, sr=sr)
    pitch = np.mean(pitches[pitches > 0]) if len(pitches[pitches > 0]) > 0 else 0
    tempogram = librosa.feature.tempogram(y=windowed_segment, sr=sr).mean(axis=1)[1:]  # Ignore first column
    downsampled_tempogram = tempogram[::int(np.ceil(len(tempogram) / 18))]
    features = np.concatenate((mfcc, delta_mfcc, [tempo], chroma, [zcr], [energy], [pitch], downsampled_tempogram))
    return features

model.to(device)

# Modify feature extraction function to use GPU
def extract_wav2vec_features(segment, sr):
    if len(segment.shape) > 1:  # Ensure the segment is a 1D array and not a 2D array
        segment = segment.flatten()  # Flatten it to 1D if needed
    inputs = feature_extractor(segment, sampling_rate=sr, return_tensors="pt", padding=False) # Move the inputs to the GPU (if available)
    inputs = {key: value.to(device) for key, value in inputs.items()}
    with torch.no_grad():
        outputs = model(**inputs)  # Extract the last hidden state and return it # Move the output back to CPU (optional, depending on what you want to do with it)
    return outputs.last_hidden_state.mean(dim=1).squeeze().cpu().numpy()

# Process train, dev, and eval datasets
for subset in ['train']: # , 'dev', 'eval'
    print(f"Processing {subset} dataset...")
    extract_features_for_segment(
        audio_directory=paths[subset]["audio_directory"], 
        label_file=paths[subset]["label_file"], 
        save_path=paths[subset]["save_path"],
        subset_name=subset
    )


Running on GPU: GeForce RTX 2070 Super with Max-Q Design
Processing train dataset...


	This function was moved to 'librosa.feature.rhythm.tempo' in librosa version 0.10.0.
	This alias will be removed in librosa version 1.0.
  tempo = librosa.beat.tempo(y=windowed_segment, sr=sr)[0]
Processing train:  42%|██████████████████████▋                               | 10674/25380 [3:42:12<5:06:09,  1.25s/it]


MemoryError: Unable to allocate 1.02 MiB for an array with shape (130, 1025) and data type float64

In [2]:
import os
import numpy as np
import pandas as pd
import librosa
from glob import glob
from tqdm import tqdm
import torch
from transformers import Wav2Vec2FeatureExtractor, Wav2Vec2Model

# Paths and directories for train, dev, and eval
paths = {
    "train": {
        "label_file": r"D:\Datasets\PartialSpoof\database_segment_labels\database\segment_labels\train_seglab_0.16.npy",
        "audio_directory": r"D:\Datasets\PartialSpoof\Train\con_wav",
        "save_path": r"D:\Datasets\PartialSpoof\Segmented_features\train"
    },
    "dev": {
        "label_file": r"D:\Datasets\PartialSpoof\database_segment_labels\database\segment_labels\dev_seglab_0.16.npy",
        "audio_directory": r"D:\Datasets\PartialSpoof\dev\con_wav",
        "save_path": r"D:\Datasets\PartialSpoof\Segmented_features\dev"
    },
    "eval": {
        "label_file": r"D:\Datasets\PartialSpoof\database_segment_labels\database\segment_labels\eval_seglab_0.16.npy",
        "audio_directory": r"D:\Datasets\PartialSpoof\eval\con_wav",
        "save_path": r"D:\Datasets\PartialSpoof\Segmented_features\eval"
    }
}

# Check if GPU is available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Print GPU or CPU info
if torch.cuda.is_available():
    print("Running on GPU:", torch.cuda.get_device_name(0))
else:
    print("Running on CPU")

# Initialize Wav2Vec2 model and feature extractor
model_name = "facebook/wav2vec2-large-xlsr-53"
feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained(model_name)
model = Wav2Vec2Model.from_pretrained(model_name).to(device)

# Segment parameters
window_size = 0.16  # Segment length in seconds
hop_size = 0.15  # Frame shift in seconds
batch_size = 5000  # Process audio in batches

# Function to extract features for a segment
def extract_features_for_segment(audio_directory, label_file, save_path, subset_name):
    # Load segment labels
    segment_labels = np.load(label_file, allow_pickle=True).item()
    audio_files = glob(os.path.join(audio_directory, "*.wav"))
    
    # Prepare for batch processing
    batch_features = []
    for idx, audio_file in enumerate(tqdm(audio_files, desc=f"Processing {subset_name}")):
        file_id = os.path.basename(audio_file).replace('.wav', '')
        
        if file_id not in segment_labels:
            continue  # Skip if file_id not in labels

        # Load audio
        y, sr = librosa.load(audio_file, sr=16000)
        segment_labels_for_file = segment_labels[file_id]
        
        # Process segments
        segment_length = int(window_size * sr)
        hop_length = int(hop_size * sr)
        
        for i, seg_label in enumerate(segment_labels_for_file):
            start = i * hop_length
            end = start + segment_length
            segment = (
                np.pad(y[start:], (0, segment_length - len(y[start:])), mode='edge')
                if end > len(y)
                else y[start:end]
            )
            # Extract features
            handcrafted = extract_handcrafted_features(segment, sr)
            wav2vec = extract_wav2vec_features(segment, sr)
            combined = np.concatenate(([file_id], handcrafted, wav2vec, [seg_label]))
            batch_features.append(combined)

        # Save batch periodically
        if len(batch_features) >= batch_size:
            save_batch(batch_features, save_path, subset_name, idx // batch_size + 1)
            batch_features = []  # Reset the batch

    # Save remaining features in the batch
    if batch_features:
        save_batch(batch_features, save_path, subset_name, idx // batch_size + 2)

# Save a batch of features
def save_batch(features, save_path, subset_name, batch_number):
    os.makedirs(save_path, exist_ok=True)
    df = pd.DataFrame(features)
    save_file = os.path.join(save_path, f"{subset_name}_segment_features_part_{batch_number}.csv")
    df.to_csv(save_file, index=False)

# Handcrafted features
def extract_handcrafted_features(segment, sr):
    windowed = segment * np.hanning(len(segment))
    mfcc = librosa.feature.mfcc(y=windowed, sr=sr, n_mfcc=13).mean(axis=1)
    delta_mfcc = librosa.feature.delta(mfcc)
    tempo = librosa.beat.tempo(y=windowed, sr=sr)[0]
    chroma = librosa.feature.chroma_stft(y=windowed, sr=sr).mean(axis=1)
    zcr = librosa.feature.zero_crossing_rate(windowed).mean()
    energy = librosa.feature.rms(y=windowed).mean()
    pitches, _ = librosa.piptrack(y=windowed, sr=sr)
    pitch = np.mean(pitches[pitches > 0]) if len(pitches[pitches > 0]) > 0 else 0
    tempogram = librosa.feature.tempogram(y=windowed, sr=sr).mean(axis=1)[1:]
    downsampled_tempogram = tempogram[::max(1, len(tempogram) // 18)]
    return np.concatenate((mfcc, delta_mfcc, [tempo], chroma, [zcr], [energy], [pitch], downsampled_tempogram))

# Wav2Vec2 features
def extract_wav2vec_features(segment, sr):
    inputs = feature_extractor(segment, sampling_rate=sr, return_tensors="pt", padding=False)
    inputs = {key: value.to(device) for key, value in inputs.items()}
    with torch.no_grad():
        outputs = model(**inputs)
    return outputs.last_hidden_state.mean(dim=1).squeeze().cpu().numpy()

# Process datasets
for subset in ['eval']: # train  , 'dev', 'eval'
    print(f"Processing {subset} dataset...")
    extract_features_for_segment(
        audio_directory=paths[subset]["audio_directory"],
        label_file=paths[subset]["label_file"],
        save_path=paths[subset]["save_path"],
        subset_name=subset,
    )


Running on GPU: GeForce RTX 2070 Super with Max-Q Design
Processing eval dataset...


	This function was moved to 'librosa.feature.rhythm.tempo' in librosa version 0.10.0.
	This alias will be removed in librosa version 1.0.
  tempo = librosa.beat.tempo(y=windowed, sr=sr)[0]
Processing eval: 100%|████████████████████████████████████████████████████████| 71237/71237 [23:50:07<00:00,  1.20s/it]


In [None]:
import os
import numpy as np
import pandas as pd
import librosa
from glob import glob
from tqdm import tqdm
import torch
from transformers import Wav2Vec2FeatureExtractor, Wav2Vec2Model

# Paths and directories for train, dev, and eval
paths = {
    "train": {
        "label_file": r"D:\Datasets\PartialSpoof\database_segment_labels\database\segment_labels\train_seglab_0.16.npy",
        "audio_directory": r"D:\Datasets\PartialSpoof\Train\con_wav",
        "save_path": r"D:\Datasets\PartialSpoof\Segmented_features\train"
    },
    "dev": {
        "label_file": r"D:\Datasets\PartialSpoof\database_segment_labels\database\segment_labels\dev_seglab_0.16.npy",
        "audio_directory": r"D:\Datasets\PartialSpoof\dev\con_wav",
        "save_path": r"D:\Datasets\PartialSpoof\Segmented_features\dev"
    },
    "eval": {
        "label_file": r"D:\Datasets\PartialSpoof\database_segment_labels\database\segment_labels\eval_seglab_0.16.npy",
        "audio_directory": r"D:\Datasets\PartialSpoof\eval\con_wav",
        "save_path": r"D:\Datasets\PartialSpoof\Segmented_features\eval"
    }
}

# Check if GPU is available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Print GPU or CPU info
if torch.cuda.is_available():
    print("Running on GPU:", torch.cuda.get_device_name(0))
else:
    print("Running on CPU")

# Initialize Wav2Vec2 model and feature extractor
model_name = "facebook/wav2vec2-large-xlsr-53"
feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained(model_name)
model = Wav2Vec2Model.from_pretrained(model_name).to(device)

# Segment parameters
window_size = 0.16  # Segment length in seconds
hop_size = 0.15  # Frame shift in seconds
batch_size = 12000  # Process audio in batches

# Function to extract features for a segment
def extract_features_for_segment(audio_directory, label_file, save_path, subset_name):
    # Load segment labels
    segment_labels = np.load(label_file, allow_pickle=True).item()
    audio_files = glob(os.path.join(audio_directory, "*.wav"))
    
    # Prepare for batch processing
    batch_features = []
    for idx, audio_file in enumerate(tqdm(audio_files, desc=f"Processing {subset_name}")):
        file_id = os.path.basename(audio_file).replace('.wav', '')
        
        if file_id not in segment_labels:
            continue  # Skip if file_id not in labels

        # Load audio
        y, sr = librosa.load(audio_file, sr=16000)
        segment_labels_for_file = segment_labels[file_id]
        
        # Process segments
        segment_length = int(window_size * sr)
        hop_length = int(hop_size * sr)
        
        for i, seg_label in enumerate(segment_labels_for_file):
            start = i * hop_length
            end = start + segment_length
            segment = (
                np.pad(y[start:], (0, segment_length - len(y[start:])), mode='edge')
                if end > len(y)
                else y[start:end]
            )
            # Extract features
            handcrafted = extract_handcrafted_features(segment, sr)
            wav2vec = extract_wav2vec_features(segment, sr)
            combined = np.concatenate(([file_id], handcrafted, wav2vec, [seg_label]))
            batch_features.append(combined)

        # Save batch periodically
        if len(batch_features) >= batch_size:
            save_batch(batch_features, save_path, subset_name, idx // batch_size + 1)
            batch_features = []  # Reset the batch

    # Save remaining features in the batch
    if batch_features:
        save_batch(batch_features, save_path, subset_name, idx // batch_size + 2)

# Save a batch of features
def save_batch(features, save_path, subset_name, batch_number):
    os.makedirs(save_path, exist_ok=True)
    df = pd.DataFrame(features)
    save_file = os.path.join(save_path, f"{subset_name}_segment_features_part_{batch_number}.csv")
    df.to_csv(save_file, index=False)

# Handcrafted features
def extract_handcrafted_features(segment, sr):
    windowed = segment * np.hanning(len(segment))
    mfcc = librosa.feature.mfcc(y=windowed, sr=sr, n_mfcc=13).mean(axis=1)
    delta_mfcc = librosa.feature.delta(mfcc)
    tempo = librosa.beat.tempo(y=windowed, sr=sr)[0]
    chroma = librosa.feature.chroma_stft(y=windowed, sr=sr).mean(axis=1)
    zcr = librosa.feature.zero_crossing_rate(windowed).mean()
    energy = librosa.feature.rms(y=windowed).mean()
    pitches, _ = librosa.piptrack(y=windowed, sr=sr)
    pitch = np.mean(pitches[pitches > 0]) if len(pitches[pitches > 0]) > 0 else 0
    tempogram = librosa.feature.tempogram(y=windowed, sr=sr).mean(axis=1)[1:]
    downsampled_tempogram = tempogram[::max(1, len(tempogram) // 18)]
    return np.concatenate((mfcc, delta_mfcc, [tempo], chroma, [zcr], [energy], [pitch], downsampled_tempogram))

# Wav2Vec2 features
def extract_wav2vec_features(segment, sr):
    inputs = feature_extractor(segment, sampling_rate=sr, return_tensors="pt", padding=False)
    inputs = {key: value.to(device) for key, value in inputs.items()}
    with torch.no_grad():
        outputs = model(**inputs)
    return outputs.last_hidden_state.mean(dim=1).squeeze().cpu().numpy()

# Process datasets
for subset in ['dev']: # train  , 'dev', 'eval', 'eval'
    print(f"Processing {subset} dataset...")
    extract_features_for_segment(
        audio_directory=paths[subset]["audio_directory"],
        label_file=paths[subset]["label_file"],
        save_path=paths[subset]["save_path"],
        subset_name=subset,
    )


Running on GPU: GeForce RTX 2070 Super with Max-Q Design
Processing dev dataset...


	This function was moved to 'librosa.feature.rhythm.tempo' in librosa version 0.10.0.
	This alias will be removed in librosa version 1.0.
  tempo = librosa.beat.tempo(y=windowed, sr=sr)[0]
Processing dev:   0%|                                                             | 43/24844 [00:42<6:37:53,  1.04it/s]

In [2]:
import os
import numpy as np
import pandas as pd

# Paths to the files
label_file_path = r"D:\Datasets\PartialSpoof\database_segment_labels\database\segment_labels\train_seglab_0.16.npy"
csv_dir_path = r"D:\Datasets\PartialSpoof\Segmented_features\train"
csv_file_template = os.path.join(csv_dir_path, "train_segment_features_part_{}.csv")

# Load segment labels from .npy file
segment_labels = np.load(label_file_path, allow_pickle=True).item()

# Function to check the number of segment labels and features
def diagnose_features_and_labels():
    # Initialize dictionary to store diagnostics
    diagnostics = {}
    
    # Iterate through all 7 CSV files
    for part in range(1, 8):
        csv_file = csv_file_template.format(part)
        
        # Load the CSV file
        if not os.path.exists(csv_file):
            print(f"CSV file not found: {csv_file}")
            continue
        print(f"Processing {csv_file}...")
        df = pd.read_csv(csv_file)
        
        # Iterate through each row in the CSV file
        for _, row in df.iterrows():
            file_id = row.iloc[0]  # File ID is in the first column
            
            # Check if the file ID exists in the segment labels
            if file_id in segment_labels:
                num_segment_labels = len(segment_labels[file_id])
                # Track the count of features for this file_id
                if file_id not in diagnostics:
                    diagnostics[file_id] = {"labels_count": num_segment_labels, "features_count": 0}
                diagnostics[file_id]["features_count"] += 1

    # Analyze mismatches and missing file IDs
    mismatched_files = []
    missing_file_ids = set(segment_labels.keys()) - set(diagnostics.keys())

    for file_id, counts in diagnostics.items():
        if counts["labels_count"] != counts["features_count"]:
            mismatched_files.append(
                (file_id, counts["labels_count"], counts["features_count"])
            )

    # Print results
    if mismatched_files:
        print("Mismatched files (file_id, labels_count, features_count):")
        for file_id, labels_count, features_count in mismatched_files:
            print(f"File: {file_id} | Labels: {labels_count} | Features: {features_count}")
    else:
        print("No mismatches found.")

    print(f"\nNumber of audio files in labels but not in features: {len(missing_file_ids)}")
    if missing_file_ids:
        print("Missing file IDs:")
        print(", ".join(list(missing_file_ids)))

# Run the diagnosis
diagnose_features_and_labels()


Processing D:\Datasets\PartialSpoof\Segmented_features\train\train_segment_features_part_1.csv...
Processing D:\Datasets\PartialSpoof\Segmented_features\train\train_segment_features_part_2.csv...
Processing D:\Datasets\PartialSpoof\Segmented_features\train\train_segment_features_part_3.csv...
Processing D:\Datasets\PartialSpoof\Segmented_features\train\train_segment_features_part_4.csv...
Processing D:\Datasets\PartialSpoof\Segmented_features\train\train_segment_features_part_5.csv...
Processing D:\Datasets\PartialSpoof\Segmented_features\train\train_segment_features_part_6.csv...
Processing D:\Datasets\PartialSpoof\Segmented_features\train\train_segment_features_part_7.csv...
No mismatches found.

Number of audio files in labels but not in features: 23944
Missing file IDs:
LA_T_7022615, CON_T_0008080, CON_T_0016241, CON_T_0008098, CON_T_0014943, CON_T_0011485, CON_T_0008804, CON_T_0005998, LA_T_1608170, CON_T_0011202, CON_T_0000851, CON_T_0020155, LA_T_5383824, CON_T_0018917, CON_T_00

In [None]:
import os
import numpy as np
import pandas as pd
from glob import glob
from tqdm import tqdm
import librosa
import torch
from transformers import Wav2Vec2FeatureExtractor, Wav2Vec2Model

# Paths and directories for train, dev, and eval
paths = {
    "train": {
        "label_file": r"F:\Awais_data\Datasets\PartialSpoof\database_segment_labels\database\segment_labels\train_seglab_0.16.npy",
        "audio_directory": r"F:\Awais_data\Datasets\PartialSpoof\Train\con_wav",
        "csv_dir": r"F:\Awais_data\Datasets\PartialSpoof\segmented_features\train",
        "save_path": r"F:\Awais_data\Datasets\PartialSpoof\segmented_features\train\missing_train"
    },
}

# Check if GPU is available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
if torch.cuda.is_available():
    print("Running on GPU:", torch.cuda.get_device_name(0))
else:
    print("Running on CPU")

# Initialize Wav2Vec2 model and feature extractor
model_name = "facebook/wav2vec2-large-xlsr-53"
feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained(model_name)
model = Wav2Vec2Model.from_pretrained(model_name).to(device)

# Segment parameters
window_size = 0.16  # Segment length in seconds
hop_size = 0.15  # Frame shift in seconds
batch_size = 50000  # Process audio in batches

# Detect missing audio features
def detect_missing_features(label_file, csv_dir):
    segment_labels = np.load(label_file, allow_pickle=True).item()
    all_file_ids = set(segment_labels.keys())

    # Gather file IDs from all CSV files
    extracted_file_ids = set()
    for part in range(1, 8):
        csv_file = os.path.join(csv_dir, f"train_segment_features_part_{part}.csv")
        if os.path.exists(csv_file):
            df = pd.read_csv(csv_file)
            extracted_file_ids.update(df.iloc[:, 0].astype(str).tolist())

    # Find missing file IDs
    missing_file_ids = all_file_ids - extracted_file_ids
    return missing_file_ids, segment_labels

# Extract and save features for missing files
def extract_missing_features(missing_file_ids, segment_labels, audio_directory, save_path):
    os.makedirs(save_path, exist_ok=True)
    audio_files = glob(os.path.join(audio_directory, "*.wav"))
    batch_features = []

    for idx, audio_file in enumerate(tqdm(audio_files, desc="Processing missing features")):
        file_id = os.path.basename(audio_file).replace('.wav', '')
        if file_id not in missing_file_ids:
            continue

        y, sr = librosa.load(audio_file, sr=16000)
        segment_labels_for_file = segment_labels[file_id]
        segment_length = int(window_size * sr)
        hop_length = int(hop_size * sr)

        for i, seg_label in enumerate(segment_labels_for_file):
            start = i * hop_length
            end = start + segment_length
            segment = (
                np.pad(y[start:], (0, segment_length - len(y[start:])), mode='edge')
                if end > len(y)
                else y[start:end]
            )
            # Extract features
            handcrafted = extract_handcrafted_features(segment, sr)
            wav2vec = extract_wav2vec_features(segment, sr)
            combined = np.concatenate(([file_id], handcrafted, wav2vec, [seg_label]))
            batch_features.append(combined)

        # Save batch periodically
        if len(batch_features) >= batch_size:
            save_batch(batch_features, save_path, "missing_train", idx // batch_size + 1)
            batch_features = []

    # Save remaining features
    if batch_features:
        save_batch(batch_features, save_path, "missing_train", idx // batch_size + 2)

# Save a batch of features
def save_batch(features, save_path, subset_name, batch_number):
    df = pd.DataFrame(features)
    save_file = os.path.join(save_path, f"{subset_name}_segment_features_part_{batch_number}.csv")
    df.to_csv(save_file, index=False)

# Handcrafted features
def extract_handcrafted_features(segment, sr):
    windowed = segment * np.hanning(len(segment))
    mfcc = librosa.feature.mfcc(y=windowed, sr=sr, n_mfcc=13).mean(axis=1)
    delta_mfcc = librosa.feature.delta(mfcc)
    tempo = librosa.beat.tempo(y=windowed, sr=sr)[0]
    chroma = librosa.feature.chroma_stft(y=windowed, sr=sr).mean(axis=1)
    zcr = librosa.feature.zero_crossing_rate(windowed).mean()
    energy = librosa.feature.rms(y=windowed).mean()
    pitches, _ = librosa.piptrack(y=windowed, sr=sr)
    pitch = np.mean(pitches[pitches > 0]) if len(pitches[pitches > 0]) > 0 else 0
    tempogram = librosa.feature.tempogram(y=windowed, sr=sr).mean(axis=1)[1:]
    downsampled_tempogram = tempogram[::max(1, len(tempogram) // 18)]
    return np.concatenate((mfcc, delta_mfcc, [tempo], chroma, [zcr], [energy], [pitch], downsampled_tempogram))

# Wav2Vec2 features
def extract_wav2vec_features(segment, sr):
    inputs = feature_extractor(segment, sampling_rate=sr, return_tensors="pt", padding=False)
    inputs = {key: value.to(device) for key, value in inputs.items()}
    with torch.no_grad():
        outputs = model(**inputs)
    return outputs.last_hidden_state.mean(dim=1).squeeze().cpu().numpy()

# Process train subset
print("Processing train dataset...")
missing_file_ids, segment_labels = detect_missing_features(
    label_file=paths["train"]["label_file"],
    csv_dir=paths["train"]["csv_dir"]
)
print(f"Number of missing file IDs: {len(missing_file_ids)}")

extract_missing_features(
    missing_file_ids=missing_file_ids,
    segment_labels=segment_labels,
    audio_directory=paths["train"]["audio_directory"],
    save_path=paths["train"]["save_path"]
)


In [None]:
import os
import numpy as np
import pandas as pd
from glob import glob
from tqdm import tqdm
import librosa
import torch
from transformers import Wav2Vec2FeatureExtractor, Wav2Vec2Model

# Paths and directories for train, dev, and eval
paths = {
    "eval": {
        "label_file": r"F:\Awais_data\Datasets\PartialSpoof\database_segment_labels\database\segment_labels\eval_seglab_0.16.npy",
        "audio_directory": r"F:\Awais_data\Datasets\PartialSpoof\eval\con_wav",
        "csv_dir": r"F:\Awais_data\Datasets\PartialSpoof\segmented_features\eval",
        "save_path": r"F:\Awais_data\Datasets\PartialSpoof\segmented_features\eval\missing_eval"
    },
}

# Check if GPU is available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
if torch.cuda.is_available():
    print("Running on GPU:", torch.cuda.get_device_name(0))
else:
    print("Running on CPU")

# Initialize Wav2Vec2 model and feature extractor
model_name = "facebook/wav2vec2-large-xlsr-53"
feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained(model_name)
model = Wav2Vec2Model.from_pretrained(model_name).to(device)

# Segment parameters
window_size = 0.16  # Segment length in seconds
hop_size = 0.15  # Frame shift in seconds
batch_size = 50000  # Process audio in batches

# Detect missing audio features
def detect_missing_features(label_file, csv_dir):
    segment_labels = np.load(label_file, allow_pickle=True).item()
    all_file_ids = set(segment_labels.keys())

    # Gather file IDs from all CSV files
    extracted_file_ids = set()
    for part in range(1, 20):
        csv_file = os.path.join(csv_dir, f"eval_segment_features_part_{part}.csv")
        if os.path.exists(csv_file):
            df = pd.read_csv(csv_file)
            extracted_file_ids.update(df.iloc[:, 0].astype(str).tolist())

    # Find missing file IDs
    missing_file_ids = all_file_ids - extracted_file_ids
    return missing_file_ids, segment_labels

# Extract and save features for missing files
def extract_missing_features(missing_file_ids, segment_labels, audio_directory, save_path):
    os.makedirs(save_path, exist_ok=True)
    audio_files = glob(os.path.join(audio_directory, "*.wav"))
    batch_features = []

    for idx, audio_file in enumerate(tqdm(audio_files, desc="Processing missing features")):
        file_id = os.path.basename(audio_file).replace('.wav', '')
        if file_id not in missing_file_ids:
            continue

        y, sr = librosa.load(audio_file, sr=16000)
        segment_labels_for_file = segment_labels[file_id]
        segment_length = int(window_size * sr)
        hop_length = int(hop_size * sr)

        for i, seg_label in enumerate(segment_labels_for_file):
            start = i * hop_length
            end = start + segment_length
            segment = (
                np.pad(y[start:], (0, segment_length - len(y[start:])), mode='edge')
                if end > len(y)
                else y[start:end]
            )
            # Extract features
            handcrafted = extract_handcrafted_features(segment, sr)
            wav2vec = extract_wav2vec_features(segment, sr)
            combined = np.concatenate(([file_id], handcrafted, wav2vec, [seg_label]))
            batch_features.append(combined)

        # Save batch periodically
        if len(batch_features) >= batch_size:
            save_batch(batch_features, save_path, "missing_eval", idx // batch_size + 1)
            batch_features = []

    # Save remaining features
    if batch_features:
        save_batch(batch_features, save_path, "missing_eval", idx // batch_size + 2)

# Save a batch of features
def save_batch(features, save_path, subset_name, batch_number):
    df = pd.DataFrame(features)
    save_file = os.path.join(save_path, f"{subset_name}_segment_features_part_{batch_number}.csv")
    df.to_csv(save_file, index=False)

# Handcrafted features
def extract_handcrafted_features(segment, sr):
    windowed = segment * np.hanning(len(segment))
    mfcc = librosa.feature.mfcc(y=windowed, sr=sr, n_mfcc=13).mean(axis=1)
    delta_mfcc = librosa.feature.delta(mfcc)
    tempo = librosa.beat.tempo(y=windowed, sr=sr)[0]
    chroma = librosa.feature.chroma_stft(y=windowed, sr=sr).mean(axis=1)
    zcr = librosa.feature.zero_crossing_rate(windowed).mean()
    energy = librosa.feature.rms(y=windowed).mean()
    pitches, _ = librosa.piptrack(y=windowed, sr=sr)
    pitch = np.mean(pitches[pitches > 0]) if len(pitches[pitches > 0]) > 0 else 0
    tempogram = librosa.feature.tempogram(y=windowed, sr=sr).mean(axis=1)[1:]
    downsampled_tempogram = tempogram[::max(1, len(tempogram) // 18)]
    return np.concatenate((mfcc, delta_mfcc, [tempo], chroma, [zcr], [energy], [pitch], downsampled_tempogram))

# Wav2Vec2 features
def extract_wav2vec_features(segment, sr):
    inputs = feature_extractor(segment, sampling_rate=sr, return_tensors="pt", padding=False)
    inputs = {key: value.to(device) for key, value in inputs.items()}
    with torch.no_grad():
        outputs = model(**inputs)
    return outputs.last_hidden_state.mean(dim=1).squeeze().cpu().numpy()

# Process train subset
print("Processing eval dataset...")
missing_file_ids, segment_labels = detect_missing_features(
    label_file=paths["eval"]["label_file"],
    csv_dir=paths["eval"]["csv_dir"]
)
print(f"Number of missing file IDs: {len(missing_file_ids)}")

extract_missing_features(
    missing_file_ids=missing_file_ids,
    segment_labels=segment_labels,
    audio_directory=paths["eval"]["audio_directory"],
    save_path=paths["eval"]["save_path"]
)


Running on GPU: NVIDIA GeForce RTX 4090


Some weights of Wav2Vec2Model were not initialized from the model checkpoint at facebook/wav2vec2-large-xlsr-53 and are newly initialized: ['wav2vec2.encoder.pos_conv_embed.conv.parametrizations.weight.original0', 'wav2vec2.encoder.pos_conv_embed.conv.parametrizations.weight.original1']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Processing eval dataset...
Number of missing file IDs: 65365


	This function was moved to 'librosa.feature.rhythm.tempo' in librosa version 0.10.0.
	This alias will be removed in librosa version 1.0.
  tempo = librosa.beat.tempo(y=windowed, sr=sr)[0]
  return pitch_tuning(
Processing missing features:  30%|████████████▌                             | 21376/71237 [3:51:22<23:28:17,  1.69s/it]