In [22]:
import os
import numpy as np
import librosa
import pandas as pd
from glob import glob
from tqdm import tqdm  # Import tqdm for progress bar

# Define paths
train_audio_path = r"F:\\Awais_data\\Datasets\\PartialSpoof\\dev\\con_wav\\*.wav"
train_label_path = r"F:\Awais_data\Datasets\PartialSpoof\database_segment_labels\database\segment_labels\dev_seglab_0.16.npy"
# save_path = r"C:\Notebooks\rrl\train_features.csv"

if os.path.exists(train_label_path):
    print("File exists.")
else:
    print("File does not exist.")
    
# Load label file with allow_pickle=True
# train_labels = np.load(train_label_path, allow_pickle=True).item()

try:
    train_labels = np.load(train_label_path, allow_pickle=True).item()
    print("File loaded successfully.")
except Exception as e:
    print(f"Error loading file: {e}")


# Feature extraction with window size = 0.16s, hop size = 0.00s
window_size = int(0.16 * 16000)  # Convert 0.16s to number of samples (window size)
hop_size = 0  # Overlap frames completely

# Feature extraction function
def extract_features(y, sr):
    zcr = librosa.feature.zero_crossing_rate(y + 0.0001).mean()  # Avoid division by zero
    energy = np.sum(y ** 2) / len(y)
    spectral_centroid = librosa.feature.spectral_centroid(y=y, sr=sr).mean()
    spectral_bandwidth = librosa.feature.spectral_bandwidth(y=y, sr=sr).mean()
    pitches, _ = librosa.core.piptrack(y=y, sr=sr)
    pitch = np.mean(pitches[pitches > 0]) if np.any(pitches > 0) else 0
    return [zcr, energy, spectral_centroid, spectral_bandwidth, pitch]

# Process each audio file
def process_dataset(audio_files, labels):
    all_features = []

    for i, audio_file in tqdm(enumerate(audio_files), total=len(audio_files), desc="Processing files"):
        file_id = audio_file.split('\\')[-1].replace('.wav', '')  # Extract file ID from path
        if file_id not in labels:
            print(f"Labels not found for {file_id}, skipping...")
            continue
        
        # Load audio
        y, sr = librosa.load(audio_file, sr=None)
        
        # Get the number of segments for this file from the label dictionary
        segment_labels = labels[file_id]
        num_segments = len(segment_labels)
        
        # Determine total length and segment length in samples
        segment_length = int(window_size * sr)
        hop_length = int(hop_size * sr)
        total_length = num_segments * segment_length
        
        # Pad or truncate audio to match the expected length
        if len(y) < total_length:
            y = np.pad(y, (0, total_length - len(y)))
        else:
            y = y[:total_length]
        
        # Extract features for each segment and aggregate
        features = []
        for j in range(num_segments):
            start_sample = j * hop_length
            end_sample = start_sample + segment_length
            segment = y[start_sample:end_sample]
            
            # Extract features for this segment
            segment_features = extract_features(segment, sr)
            features.append(segment_features)
        
        # Flatten features into a single row (1D)
        features = np.mean(features, axis=0)  # Average features across segments
        all_features.append(features)
    
    # Create DataFrame for easy export and analysis
    columns = ['ZCR', 'Energy', 'SpectralCentroid', 'SpectralBandwidth', 'Pitch']
    features_df = pd.DataFrame(all_features, columns=columns)
    return features_df

# Load the audio files
train_audio_files = glob(train_audio_path)

# Process the training set
train_features_df = process_dataset(train_audio_files, train_labels)

# Save the extracted features to a CSV for further use
save_path = r"C:\\Notebooks\\rrl\\"

# Save the extracted features to a CSV for further use
train_features_df.to_csv(save_path + 'dev.csv', index=False)


File exists.
File loaded successfully.


Processing files: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 25380/25380 [07:45<00:00, 54.51it/s]


In [9]:
# Extract the defaultdict from the object
train_labels_content = train_labels.item()  # Unwrap the object

# Print the first 5 keys and their corresponding values
keys = list(train_labels_content.keys())  # Get all keys from the defaultdict
print(f"First 5 keys: {keys[:5]}")

# Print the first value corresponding to the first key (to inspect the structure of labels)
first_key = keys[0]
print(f"Labels for the first key ({first_key}): {train_labels_content[first_key]}")


First 5 keys: ['LA_T_1000406', 'LA_T_1007571', 'LA_T_1007663', 'LA_T_1011221', 'LA_T_1013597']
Labels for the first key (LA_T_1000406): ['1' '1' '1' '1' '1' '1' '1' '1' '1' '1' '1' '1' '1' '1' '1' '1' '1' '1']


In [2]:
import numpy as np

# Define the path to the label file
train_label_path = r"F:\Awais_data\Datasets\PartialSpoof\database_segment_labels\database\segment_labels\dev_seglab_0.16.npy"

# Load the label file with allow_pickle=True
try:
    train_labels = np.load(train_label_path, allow_pickle=True).item()
    print("Label file loaded successfully.")
except Exception as e:
    print(f"Error loading label file: {e}")
    train_labels = None

# Check if the labels were loaded
if train_labels:
    # Get the first 5 keys
    keys = list(train_labels.keys())[:5]
    
    # Print the keys and corresponding labels
    for key in keys:
        print(f"Key: {key}")
        print(f"Labels: {train_labels[key]}")
else:
    print("No labels to display.")


Label file loaded successfully.
Key: LA_D_1024892
Labels: ['1' '1' '1' '1' '1' '1' '1' '1' '1' '1' '1' '1' '1' '1']
Key: LA_D_1026868
Labels: ['1' '1' '1' '1' '1' '1' '1' '1' '1' '1' '1' '1' '1' '1' '1' '1' '1' '1'
 '1' '1' '1' '1' '1' '1' '1' '1' '1' '1' '1' '1' '1' '1' '1' '1']
Key: LA_D_1028589
Labels: ['1' '1' '1' '1' '1' '1' '1' '1' '1' '1' '1' '1' '1' '1' '1' '1' '1' '1'
 '1' '1' '1' '1' '1' '1' '1' '1']
Key: LA_D_1038259
Labels: ['1' '1' '1' '1' '1' '1' '1' '1' '1' '1' '1' '1' '1' '1' '1' '1' '1' '1'
 '1' '1' '1' '1' '1' '1' '1' '1' '1' '1' '1' '1']
Key: LA_D_1047731
Labels: ['1' '1' '1' '1' '1' '1' '1' '1' '1' '1' '1' '1' '1' '1' '1' '1' '1' '1'
 '1' '1' '1' '1' '1' '1' '1' '1' '1' '1' '1' '1' '1' '1' '1' '1']


In [None]:
import os
import numpy as np
import librosa
import pandas as pd
from glob import glob
from tqdm import tqdm  # Import tqdm for progress bar

# Define paths
train_audio_path = r"F:\\Awais_data\\Datasets\\PartialSpoof\\train\\con_wav\\*.wav"
train_label_path = r"F:\Awais_data\Datasets\PartialSpoof\database_segment_labels\database\segment_labels\train_seglab_0.16.npy"
save_path = r"C:\\Notebooks\\rrl\\"

# Load label file with allow_pickle=True
train_labels = np.load(train_label_path, allow_pickle=True).item()

# Parameters
window_size = 0.16  # 160ms segment length
hop_size = 0.16     # 160ms shift, no overlap

# Feature extraction function
def extract_features(y, sr):
    zcr = librosa.feature.zero_crossing_rate(y + 0.0001).mean()  # Avoid division by zero
    energy = np.sum(y ** 2) / len(y)
    spectral_centroid = librosa.feature.spectral_centroid(y=y, sr=sr).mean()
    spectral_bandwidth = librosa.feature.spectral_bandwidth(y=y, sr=sr).mean()
    pitches, _ = librosa.core.piptrack(y=y, sr=sr)
    pitch = np.mean(pitches[pitches > 0]) if np.any(pitches > 0) else 0
    return [zcr, energy, spectral_centroid, spectral_bandwidth, pitch]

# Process each audio file
def process_dataset(audio_files, labels):
    all_features = []

    for i, audio_file in tqdm(enumerate(audio_files), total=len(audio_files), desc="Processing files"):
        file_id = audio_file.split('\\')[-1].replace('.wav', '')  # Extract file ID from path
        if file_id not in labels:
            print(f"Labels not found for {file_id}, skipping...")
            continue
        
        # Load audio
        y, sr = librosa.load(audio_file, sr=None)
        
        # Get the number of segments for this file from the label dictionary
        segment_labels = labels[file_id]
        num_segments = len(segment_labels)
        
        # Determine total length and segment length in samples
        segment_length = int(window_size * sr)
        hop_length = int(hop_size * sr)
        total_length = num_segments * segment_length
        
        # Pad or truncate audio to match the expected length
        if len(y) < total_length:
            y = np.pad(y, (0, total_length - len(y)))
        else:
            y = y[:total_length]
        
        # Extract features for each segment and aggregate
        features = []
        for j in range(num_segments):
            start_sample = j * hop_length
            end_sample = start_sample + segment_length
            segment = y[start_sample:end_sample]
            
            # Extract features for this segment
            segment_features = extract_features(segment, sr)
            features.append(segment_features)
        
        # Flatten features into a single row (1D)
        features = np.mean(features, axis=0)  # Average features across segments

        # Add label information
        label = int(segment_labels[0])  # Assuming the same label for all segments in a file
        polarity = 'positive' if label == 1 else 'negative'
        features = np.append(features, [label, polarity])  # Append label and polarity
        all_features.append(features)

    # Create DataFrame for easy export and analysis
    columns = ['ZCR', 'Energy', 'SpectralCentroid', 'SpectralBandwidth', 'Pitch', 'Label', 'Polarity']
    features_df = pd.DataFrame(all_features, columns=columns)
    return features_df

# Load the audio files
train_audio_files = glob(train_audio_path)

# Process only the first 10 audio files for testing
train_audio_files = train_audio_files[:]

# Process the training set
train_features_df = process_dataset(train_audio_files, train_labels)

# Save the extracted features to a CSV for further use
train_features_df.to_csv(save_path + 'train_Sample.csv', index=False)

# Save the extracted features to a .data file
train_features_df.to_csv(save_path + 'train_sample.data', index=False)

print("Feature extraction and saving completed for the first 10 samples.")


In [None]:
import os
import numpy as np
import librosa
import pandas as pd
from glob import glob
from tqdm import tqdm  # Import tqdm for progress bar

# Define paths
train_audio_path = r"F:\\Awais_data\\Datasets\\PartialSpoof\\eval\\con_wav\\*.wav"
train_label_path = r"F:\Awais_data\Datasets\PartialSpoof\database_segment_labels\database\segment_labels\eval_seglab_0.16.npy"
save_path = r"C:\\Notebooks\\rrl\\"

# Load label file with allow_pickle=True
train_labels = np.load(train_label_path, allow_pickle=True).item()

# Parameters
window_size = 0.16  # Segment length in seconds
hop_size = 0.16     # Frame shift in seconds

# Feature extraction function
def extract_features(y, sr):
    zcr = librosa.feature.zero_crossing_rate(y + 0.0001).mean()  # Avoid division by zero
    energy = np.sum(y ** 2) / len(y)
    spectral_centroid = librosa.feature.spectral_centroid(y=y, sr=sr).mean()
    spectral_bandwidth = librosa.feature.spectral_bandwidth(y=y, sr=sr).mean()
    pitches, _ = librosa.core.piptrack(y=y, sr=sr)
    pitch = np.mean(pitches[pitches > 0]) if np.any(pitches > 0) else 0
    return [zcr, energy, spectral_centroid, spectral_bandwidth, pitch]

# Process each audio file
def process_dataset(audio_files, labels):
    all_features = []

    for audio_file in tqdm(audio_files, total=len(audio_files), desc="Processing files"):
        file_id = audio_file.split('\\')[-1].replace('.wav', '')  # Extract file ID from path
        
        if file_id not in labels:
            print(f"Labels not found for {file_id}, skipping...")
            continue
        
        # Load audio
        y, sr = librosa.load(audio_file, sr=None)
        
        # Get segment labels for this file
        segment_labels = labels[file_id]
        
        # Determine segment length and hop length in samples
        segment_length = int(window_size * sr)
        hop_length = int(hop_size * sr)
        
        # Extract features for each segment and aggregate
        num_segments = len(segment_labels)

        for j in range(num_segments):
            start_sample = j * hop_length
            end_sample = start_sample + segment_length
            
            # Check if the segment is within the audio length
            if end_sample > len(y):
                break  # Stop if the segment exceeds audio length

            segment = y[start_sample:end_sample]
            
            # Extract features for this segment
            segment_features = extract_features(segment, sr)

            # Add label information for this segment
            label = int(segment_labels[j])  # Get the label for the current segment
            polarity = 'positive' if label == 1 else 'negative'
            
            # Append file ID, features, label, and polarity
            features_with_label = np.append([file_id], segment_features + [label, polarity])  
            
            all_features.append(features_with_label)  # Append features and label for this segment

    # Create DataFrame for easy export and analysis
    columns = ['FileID', 'ZCR', 'Energy', 'SpectralCentroid', 'SpectralBandwidth', 'Pitch', 'Label', 'Polarity']
    features_df = pd.DataFrame(all_features, columns=columns)
    return features_df

# Load the audio files
train_audio_files = glob(train_audio_path)

# Process only the first 10 audio files for testing
train_audio_files = train_audio_files[:]

# Process the training set
train_features_df = process_dataset(train_audio_files, train_labels)

# Save the extracted features to a CSV for further use
train_features_df.to_csv(save_path + 'eval.csv', index=False)

# Save the extracted features to a .data file
train_features_df.to_csv(save_path + 'eval.data', index=False)

print("Feature extraction and saving completed for the first 10 samples.")


In [10]:
import os
import numpy as np
import librosa
import pandas as pd
from glob import glob
from tqdm import tqdm  # Import tqdm for progress bar

# Define paths
train_audio_path = r"F:\\Awais_data\\Datasets\\PartialSpoof\\eval\\con_wav\\*.wav"
train_label_path = r"F:\Awais_data\Datasets\PartialSpoof\database_segment_labels\database\segment_labels\eval_seglab_0.16.npy"
save_path = r"C:\\Notebooks\\rrl\\"

# Load label file with allow_pickle=True
train_labels = np.load(train_label_path, allow_pickle=True).item()

# Parameters
window_size = 0.16  # Segment length in seconds
hop_size = 0.16     # Frame shift in seconds
chunk_size = 500000  # Number of rows per chunk

# Feature extraction function
def extract_features(y, sr):
    zcr = librosa.feature.zero_crossing_rate(y + 0.0001).mean()  # Avoid division by zero
    energy = np.sum(y ** 2) / len(y)
    spectral_centroid = librosa.feature.spectral_centroid(y=y, sr=sr).mean()
    spectral_bandwidth = librosa.feature.spectral_bandwidth(y=y, sr=sr).mean()
    pitches, _ = librosa.core.piptrack(y=y, sr=sr)
    pitch = np.mean(pitches[pitches > 0]) if np.any(pitches > 0) else 0
    return [zcr, energy, spectral_centroid, spectral_bandwidth, pitch]

# Process each audio file
def process_dataset(audio_files, labels):
    all_features = []
    file_counter = 1  # Counter for file naming

    for audio_file in tqdm(audio_files, total=len(audio_files), desc="Processing files"):
        file_id = audio_file.split('\\')[-1].replace('.wav', '')  # Extract file ID from path
        
        if file_id not in labels:
            print(f"Labels not found for {file_id}, skipping...")
            continue
        
        # Load audio
        y, sr = librosa.load(audio_file, sr=None)
        
        # Get segment labels for this file
        segment_labels = labels[file_id]
        
        # Determine segment length and hop length in samples
        segment_length = int(window_size * sr)
        hop_length = int(hop_size * sr)
        
        # Extract features for each segment
        num_segments = len(segment_labels)

        for j in range(num_segments):
            start_sample = j * hop_length
            end_sample = start_sample + segment_length
            
            # Check if the segment is within the audio length
            if end_sample > len(y):
                break  # Stop if the segment exceeds audio length

            segment = y[start_sample:end_sample]
            
            # Extract features for this segment
            segment_features = extract_features(segment, sr)

            # Add label information for this segment
            label = int(segment_labels[j])  # Get the label for the current segment
            polarity = 'positive' if label == 1 else 'negative'
            
            # Append file ID, features, label, and polarity
            features_with_label = np.append([file_id], segment_features + [label, polarity])  
            all_features.append(features_with_label)  # Append features and label for this segment

            # Check if we need to save the current chunk
            if len(all_features) >= chunk_size:
                save_chunk(all_features, save_path, audio_file, file_counter)
                file_counter += 1
                all_features = []  # Reset for next chunk

    # Save any remaining features
    if all_features:
        save_chunk(all_features, save_path, audio_file, file_counter)

def save_chunk(features, save_path, audio_file, file_counter):
    # Determine file name based on path
    if 'dev' in audio_file:
        filename = f"dev{file_counter}.csv"
    elif 'Train' in audio_file:
        filename = f"train{file_counter}.csv"
    elif 'eval' in audio_file:
        filename = f"eval{file_counter}.csv"
    else:
        return  # Skip if path doesn't match any category

    # Create DataFrame for easy export and analysis
    columns = ['FileID', 'ZCR', 'Energy', 'SpectralCentroid', 'SpectralBandwidth', 'Pitch', 'Label', 'Polarity']
    features_df = pd.DataFrame(features, columns=columns)
    
    # Save to CSV and .data file
    features_df.to_csv(os.path.join(save_path, filename), index=False)
    features_df.to_csv(os.path.join(save_path, filename.replace('.csv', '.data')), index=False)

# Load the audio files
train_audio_files = glob(train_audio_path)

# Process only the first 10 audio files for testing
train_audio_files = train_audio_files[:]

# Process the training set
process_dataset(train_audio_files, train_labels)

print("Feature extraction and saving completed.")


Processing files: 100%|█████████████████████████████████████████████████████████████████████████████████████████████| 71237/71237 [2:09:50<00:00,  9.14it/s]


Feature extraction and saving completed.


In [11]:
import os
import pandas as pd

def convert_csv_to_data(csv_file_path):
    # Load the CSV file into a DataFrame
    df = pd.read_csv(csv_file_path)
    
    # Generate the output .data file path
    data_file_path = csv_file_path.replace('.csv', '.data')
    
    # Save the DataFrame as a .data file (same format as CSV, just different extension)
    df.to_csv(data_file_path, index=False, header=True, sep=',')  # Use comma as delimiter

    print(f"Converted {csv_file_path} to {data_file_path}")

# Example usage
csv_file = "C:\\Notebooks\\rrl\\dataset\\dev_data.csv"  # Path to the CSV file
convert_csv_to_data(csv_file)


Converted C:\Notebooks\rrl\dataset\dev_data.csv to C:\Notebooks\rrl\dataset\dev_data.data


Segment level feature extraction

In [13]:
import os
import numpy as np
import librosa
import pandas as pd
from glob import glob
from tqdm import tqdm  # Import tqdm for progress bar

# Define paths
train_audio_path = r"F:\\Awais_data\\Datasets\\PartialSpoof\\train\\con_wav\\*.wav"
train_label_path = r"F:\Awais_data\Datasets\PartialSpoof\database_segment_labels\database\segment_labels\train_seglab_0.16.npy"
save_path = r"C:\\Notebooks\\rrl\\"

# Load label file with allow_pickle=True
train_labels = np.load(train_label_path, allow_pickle=True).item()

# Parameters
window_size = 0.16  # Segment length in seconds
hop_size = 0.16     # Frame shift in seconds

# Feature extraction function
def extract_features(y, sr):
    # MFCC (13 coefficients)
    mfcc = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=13).mean(axis=1)  # Taking the mean of each MFCC coefficient
    
    # Energy (already calculated by summing squares of the signal)
    energy = np.sum(y ** 2) / len(y)
    
    # Spectral envelope (using the spectral rolloff as a proxy for spectral envelope)
    spectral_envelope = librosa.feature.spectral_rolloff(y=y, sr=sr).mean()
    
    # Pitch (using piptrack as before)
    pitches, _ = librosa.core.piptrack(y=y, sr=sr)
    pitch = np.mean(pitches[pitches > 0]) if np.any(pitches > 0) else 0
    
    # LFCC (Log Frequency Cepstral Coefficients)
    lfcc = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=13, dct_type=2, norm='ortho').mean(axis=1)
    
    # Concatenate all features
    features = np.concatenate((mfcc, [energy], [spectral_envelope], [pitch], lfcc))
    return features

# Process each audio file
def process_dataset(audio_files, labels):
    all_features = []

    for audio_file in tqdm(audio_files, total=len(audio_files), desc="Processing files"):
        file_id = audio_file.split('\\')[-1].replace('.wav', '')  # Extract file ID from path
        
        if file_id not in labels:
            print(f"Labels not found for {file_id}, skipping...")
            continue
        
        # Load audio
        y, sr = librosa.load(audio_file, sr=None)
        
        # Get segment labels for this file
        segment_labels = labels[file_id]
        
        # Determine segment length and hop length in samples
        segment_length = int(window_size * sr)
        hop_length = int(hop_size * sr)
        
        # Extract features for each segment and aggregate
        num_segments = len(segment_labels)

        for j in range(num_segments):
            start_sample = j * hop_length
            end_sample = start_sample + segment_length
            
            # Check if the segment is within the audio length
            if end_sample > len(y):
                break  # Stop if the segment exceeds audio length

            segment = y[start_sample:end_sample]
            
            # Extract features for this segment
            segment_features = extract_features(segment, sr)

            # Add label information for this segment
            label = int(segment_labels[j])  # Get the label for the current segment
            polarity = 'positive' if label == 1 else 'negative'
            
            # Append file ID, features, label, and polarity
            features_with_label = np.append([file_id], segment_features.tolist() + [label, polarity])  
            
            all_features.append(features_with_label)  # Append features and label for this segment

    # Create DataFrame for easy export and analysis
    mfcc_columns = [f'MFCC_{i+1}' for i in range(13)]
    lfcc_columns = [f'LFCC_{i+1}' for i in range(13)]
    columns = ['FileID'] + mfcc_columns + ['Energy', 'SpectralEnvelope', 'Pitch'] + lfcc_columns + ['Label', 'Polarity']
    features_df = pd.DataFrame(all_features, columns=columns)
    return features_df

# Load the audio files
train_audio_files = glob(train_audio_path)

# Process only the first 10 audio files for testing
train_audio_files = train_audio_files[:]

# Process the training set
train_features_df = process_dataset(train_audio_files, train_labels)

# Save the extracted features to a CSV for further use
train_features_df.to_csv(save_path + 'train_fulldf_mfcc.csv', index=False)

# Save the extracted features to a .data file
train_features_df.to_csv(save_path + 'train_fulldf_mfcc.data', index=False)

print("Feature extraction and saving completed for the first 10 samples.")


Processing files: 100%|█████████████████████████████████████████████████████████████████████████████████████████████| 24844/24844 [1:09:39<00:00,  5.94it/s]


Feature extraction and saving completed for the first 10 samples.


Utterance level Feature Extraction

In [16]:
import os
import numpy as np
import librosa
import pandas as pd
from glob import glob
from tqdm import tqdm  # Import tqdm for progress bar

# Define paths
train_audio_path = r"F:\\Awais_data\\Datasets\\PartialSpoof\\Train\\con_wav\\*.wav"
train_label_path = r"F:\Awais_data\Datasets\PartialSpoof\protocols\PartialSpoof_LA_cm_protocols\PartialSpoof.LA.cm.train.trl.txt"
save_path = r"C:\\Notebooks\\rrl\\"

def load_labels(label_file):
    labels = {}
    with open(label_file, 'r') as file:
        for line in file:
            parts = line.strip().split(' ')
            if len(parts) >= 5:  # Ensure there are at least 5 parts
                file_id = parts[1].strip()  # Get the audio name (e.g., CON_T_0000029)
                label = parts[-1].strip()  # Get the label (e.g., "spoof" or "bonafide")
                labels[file_id] = label
    return labels

# Load labels
train_labels = load_labels(train_label_path)

# Feature extraction function
def extract_features(y, sr):
    # MFCC (13 coefficients)
    mfcc = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=13).mean(axis=1)  # Taking the mean of each MFCC coefficient
    
    # Energy (already calculated by summing squares of the signal)
    energy = np.sum(y ** 2) / len(y)
    
    # Spectral envelope (using the spectral rolloff as a proxy for spectral envelope)
    spectral_envelope = librosa.feature.spectral_rolloff(y=y, sr=sr).mean()
    
    # Pitch (using piptrack as before)
    pitches, _ = librosa.core.piptrack(y=y, sr=sr)
    pitch = np.mean(pitches[pitches > 0]) if np.any(pitches > 0) else 0
    
    # LFCC (Log Frequency Cepstral Coefficients)
    lfcc = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=13, dct_type=2, norm='ortho').mean(axis=1)
    
    # Concatenate all features
    features = np.concatenate((mfcc, [energy], [spectral_envelope], [pitch], lfcc))
    return features

# Process each audio file
def process_dataset(audio_files, labels):
    all_features = []

    for audio_file in tqdm(audio_files, total=len(audio_files), desc="Processing files"):
        file_id = audio_file.split('\\')[-1].replace('.wav', '')  # Extract file ID from path
        
        if file_id not in labels:
            print(f"Labels not found for {file_id}, skipping...")
            continue
        
        # Load audio
        y, sr = librosa.load(audio_file, sr=None)
        
        # Extract features for this utterance
        utterance_features = extract_features(y, sr)

        # Get label for this utterance
        label = labels[file_id]
        polarity = 'positive' if label == 'spoof' else 'negative'
        
        # Append file ID, features, label, and polarity
        features_with_label = np.append([file_id], utterance_features.tolist() + [label, polarity])  
        
        all_features.append(features_with_label)  # Append features and label for this utterance

    # Create DataFrame for easy export and analysis
    mfcc_columns = [f'MFCC_{i+1}' for i in range(13)]
    lfcc_columns = [f'LFCC_{i+1}' for i in range(13)]
    columns = ['FileID'] + mfcc_columns + ['Energy', 'SpectralEnvelope', 'Pitch'] + lfcc_columns + ['Label', 'Polarity']
    features_df = pd.DataFrame(all_features, columns=columns)
    return features_df

# Load the audio files
train_audio_files = glob(train_audio_path)

train_audio_files = train_audio_files[:]

# Process the training set
train_features_df = process_dataset(train_audio_files, train_labels)

# Save the extracted features to a CSV for further use
train_features_df.to_csv(save_path + 'train_fulldf_mfcc.csv', index=False)

# Save the extracted features to a .data file
train_features_df.to_csv(save_path + 'train_fulldf_mfcc.data', index=False)

print("Feature extraction and saving completed.")


Processing files: 100%|█████████████████████████████████████████████████████████████████████████████████████████████| 25380/25380 [1:47:45<00:00,  3.93it/s]


Feature extraction and saving completed.


In [13]:
import os
import numpy as np
import librosa
import pandas as pd
from glob import glob
from tqdm import tqdm  # Import tqdm for progress bar

# Define paths
train_audio_path = r"F:\\Awais_data\\Datasets\\PartialSpoof\\train\\con_wav\\*.wav"
train_label_path = r"F:\Awais_data\Datasets\PartialSpoof\database_segment_labels\database\segment_labels\train_seglab_0.16.npy"
save_path = r"C:\\Notebooks\\rrl\\"

# Load label file with allow_pickle=True
train_labels = np.load(train_label_path, allow_pickle=True).item()

# Parameters
window_size = 0.16  # Segment length in seconds
hop_size = 0.16     # Frame shift in seconds

# Feature extraction function
def extract_features(y, sr):
    # Apply the Hann window to the audio signal
    windowed_y = y * hann_window(len(y))

    # MFCC (13 coefficients)
    mfcc = librosa.feature.mfcc(y=windowed_y, sr=sr, n_mfcc=13).mean(axis=1)  # Taking the mean across time frames
    
    # Delta MFCC (first-order difference)
    delta_mfcc = librosa.feature.delta(mfcc)  # Delta of MFCC, already 1D after taking the mean
    
    # Tempo estimation (single value)
    # tempo, _ = librosa.beat.beat_track(y=windowed_y, sr=sr)
    onset_env = librosa.onset.onset_strength(y=windowed_y, sr=sr)

    # Use the onset strength to estimate tempo
    tempo = librosa.beat.tempo(onset_envelope=onset_env, sr=sr)[0]
    
    # Chroma (12 coefficients)
    chroma = librosa.feature.chroma_stft(y=windowed_y, sr=sr).mean(axis=1)
    
    # Zero-Crossing Rate (ZCR)
    zcr = librosa.feature.zero_crossing_rate(windowed_y).mean()
    
    # Energy (RMS)
    energy = librosa.feature.rms(y=windowed_y).mean()
    
    # Pitch (estimate the pitch of the signal)
    pitches, magnitudes = librosa.core.piptrack(y=windowed_y, sr=sr)
    pitch = np.mean(pitches[pitches > 0]) if len(pitches[pitches > 0]) > 0 else 0
    
    # Tempogram (downsampled to fit remaining 18 features)
    tempogram = librosa.feature.tempogram(y=windowed_y, sr=sr).mean(axis=1)
    downsampled_tempogram = tempogram[::int(np.ceil(len(tempogram) / 18))]  # Downsample to get 18 values

    # Concatenate all features into a 1D feature vector of size 60
    features = np.concatenate((mfcc, delta_mfcc, [tempo], chroma, [zcr], [energy], [pitch], downsampled_tempogram))
    
    # Ensure the length of features is 60
    assert len(features) == 60, f"Feature vector length is {len(features)}, expected 60"
    
    return features

# Process each audio file
def process_dataset(audio_files, labels):
    all_features = []

    for audio_file in tqdm(audio_files, total=len(audio_files), desc="Processing files"):
        file_id = audio_file.split('\\')[-1].replace('.wav', '')  # Extract file ID from path
        
        if file_id not in labels:
            print(f"Labels not found for {file_id}, skipping...")
            continue
        
        # Load audio
        y, sr = librosa.load(audio_file, sr=None)
        
        # Get segment labels for this file
        segment_labels = labels[file_id]
        
        # Determine segment length and hop length in samples
        segment_length = int(window_size * sr)
        hop_length = int(hop_size * sr)
        
        # Extract features for each segment and aggregate
        num_segments = len(segment_labels)

        for j in range(num_segments):
            start_sample = j * hop_length
            end_sample = start_sample + segment_length
            
            # Check if the segment is within the audio length
            if end_sample > len(y):
                break  # Stop if the segment exceeds audio length

            segment = y[start_sample:end_sample]
            
            # Extract features for this segment
            segment_features = extract_features(segment, sr)

            # Add label information for this segment
            label = int(segment_labels[j])  # Get the label for the current segment
            polarity = 'positive' if label == 1 else 'negative'
            
            # Append file ID, features, label, and polarity
            features_with_label = np.append([file_id], segment_features.tolist() + [label, polarity])  
            
            all_features.append(features_with_label)  # Append features and label for this segment

    # Create DataFrame for easy export and analysis
    feature_columns = [f'Feature_{i+1}' for i in range(60)]
    
    columns = ['FileID'] + feature_columns + ['Label', 'Polarity']
    features_df = pd.DataFrame(all_features, columns=columns)
    return features_df

# Load the audio files
train_audio_files = glob(train_audio_path)

# Process only the first 10 audio files for testing
train_audio_files = train_audio_files[:]

# Process the training set
train_features_df = process_dataset(train_audio_files, train_labels)

# Save the extracted features to a CSV for further use
train_features_df.to_csv(save_path + 'train_STD.csv', index=False)

# Save the extracted features to a .data file
# train_features_df.to_csv(save_path + 'train_STDc.data', index=False)

print("Feature extraction and saving completed.")


	This function was moved to 'librosa.feature.rhythm.tempo' in librosa version 0.10.0.
	This alias will be removed in librosa version 1.0.
  tempo = librosa.beat.tempo(onset_envelope=onset_env, sr=sr)[0]
Processing files: 100%|█████████████████████████████████████████████████████████████████████████████████████████████| 25380/25380 [5:33:44<00:00,  1.27it/s]


Feature extraction and saving completed.


In [5]:
import os
import numpy as np
import librosa
import pandas as pd
from glob import glob
from tqdm import tqdm  # Import tqdm for progress bar

# Define paths for segment and utterance level labels
train_audio_path = r"F:\\Awais_data\\Datasets\\PartialSpoof\\train\\con_wav\\*.wav"
segment_label_path = r"F:\Awais_data\Datasets\PartialSpoof\database_segment_labels\database\segment_labels\train_seglab_0.16.npy"
utterance_label_path = r"F:\Awais_data\Datasets\PartialSpoof\protocols\PartialSpoof_LA_cm_protocols\PartialSpoof.LA.cm.train.trl.txt"
save_path = r"C:\\Notebooks\\rrl_source\\dataset_raw"

# Parameters
window_size = 0.16  # Segment length in seconds
hop_size = 0.16     # Frame shift in seconds


def hann_window(length):
    return 0.5 * (1 - np.cos(2 * np.pi * np.arange(length) / (length - 1)))

# Variable to choose whether to extract segment-level or utterance-level features
extract_utterance_level = False  # Set to True if you want utterance-level, False for segment-level

# Load segment-level labels (from npy)
def load_segment_labels(label_file):
    return np.load(label_file, allow_pickle=True).item()

# Load utterance-level labels (from txt)
def load_utterance_labels(label_file):
    labels = {}
    with open(label_file, 'r') as file:
        for line in file:
            parts = line.strip().split(' ')
            if len(parts) >= 5:  # Ensure there are at least 5 parts
                file_id = parts[1].strip()  # Get the audio name (e.g., CON_T_0000029)
                label = parts[-1].strip()  # Get the label (e.g., "spoof" or "bonafide")
                labels[file_id] = label
    return labels

# Feature extraction function remains unchanged
def extract_features(y, sr):   
    windowed_y = y * hann_window(len(y)  # Apply the Hann window to the audio signal
    mfcc = librosa.feature.mfcc(y=windowed_y, sr=sr, n_mfcc=13).mean(axis=1) # MFCC (13 coefficients)
    delta_mfcc = librosa.feature.delta(mfcc)  # Delta MFCC (first-order difference)  
    onset_env = librosa.onset.onset_strength(y=windowed_y, sr=sr)
    tempo = librosa.beat.tempo(onset_envelope=onset_env, sr=sr)[0]   # Tempo estimation  
    chroma = librosa.feature.chroma_stft(y=windowed_y, sr=sr).mean(axis=1)    # Chroma (12 coefficients)  
    zcr = librosa.feature.zero_crossing_rate(windowed_y).mean()      # Zero-Crossing Rate (ZCR)   
    energy = librosa.feature.rms(y=windowed_y).mean()     # Energy (RMS)   
    pitches, magnitudes = librosa.core.piptrack(y=windowed_y, sr=sr)   # Pitch (estimate the pitch of the signal)
    pitch = np.mean(pitches[pitches > 0]) if len(pitches[pitches > 0]) > 0 else 0    
    tempogram = librosa.feature.tempogram(y=windowed_y, sr=sr).mean(axis=1) # Tempogram (downsampled to fit remaining 18 features)
    downsampled_tempogram = tempogram[::int(np.ceil(len(tempogram) / 18))]
    features = np.concatenate((mfcc, delta_mfcc, [tempo], chroma, [zcr], [energy], [pitch], downsampled_tempogram))
    assert len(features) == 60, f"Feature vector length is {len(features)}, expected 60"    
    return features

# Process each audio file
def process_dataset(audio_files, labels, extract_utterance_level):
    all_features = []

    for audio_file in tqdm(audio_files, total=len(audio_files), desc="Processing files"):
        file_id = audio_file.split('\\')[-1].replace('.wav', '')
        
        if file_id not in labels:
            print(f"Labels not found for {file_id}, skipping...")
            continue
        
        y, sr = librosa.load(audio_file, sr=None)
        
        if extract_utterance_level:
            # Extract features for entire utterance
            features = extract_features(y, sr)            
            # Use the utterance-level label
            label = 1 if labels[file_id] == 'spoof' else 0
            polarity = 'positive' if label == 1 else 'negative'            
            features_with_label = np.append([file_id], features.tolist() + [label, polarity])
            all_features.append(features_with_label)

        else:
            # Segment-level feature extraction
            segment_labels = labels[file_id]
            segment_length = int(window_size * sr)
            hop_length = int(hop_size * sr)
            num_segments = len(segment_labels)
            for j in range(num_segments):
                start_sample = j * hop_length
                end_sample = start_sample + segment_length                
                if end_sample > len(y):
                    break
                segment = y[start_sample:end_sample]
                segment_features = extract_features(segment, sr)         
                label = int(segment_labels[j])
                polarity = 'positive' if label == 1 else 'negative'
                features_with_label = np.append([file_id], segment_features.tolist() + [label, polarity])
                all_features.append(features_with_label)

    feature_columns = ['MFCC_' + str(i+1) for i in range(13)] + \
                      ['d_MFCC_' + str(i+1) for i in range(13)] + \
                      ['Tempo'] + \
                      ['Chroma_' + str(i+1) for i in range(12)] + \
                      ['ZCR', 'Energy', 'Pitch'] + \
                      ['Tempogram_' + str(i+1) for i in range(18)]
    
    columns = ['FileID'] + feature_columns + ['Label', 'Polarity']
    features_df = pd.DataFrame(all_features, columns=columns)
    return features_df

# Load labels based on extract_utterance_level flag
if extract_utterance_level:
    train_labels = load_utterance_labels(utterance_label_path)
else:
    train_labels = load_segment_labels(segment_label_path)

# Load the audio files
train_audio_files = glob(train_audio_path)
train_audio_files=train_audio_files[:]
train_features_df = process_dataset(train_audio_files, train_labels, extract_utterance_level)

if extract_utterance_level:
    train_features_df.to_csv(save_path + 'train_utterance_MDSTD.csv', index=False)
else:
    train_features_df.to_csv(save_path + 'train_segment_MDSTD.csv', index=False)

print("Feature extraction and saving completed.")


	This function was moved to 'librosa.feature.rhythm.tempo' in librosa version 0.10.0.
	This alias will be removed in librosa version 1.0.
  tempo = librosa.beat.tempo(onset_envelope=onset_env, sr=sr)[0]
  return pitch_tuning(
Processing files: 100%|████████████████████████████████████████████████████████| 25380/25380 [1:36:13<00:00,  4.40it/s]


Feature extraction and saving completed.


In [16]:
import os
import numpy as np
import pandas as pd
from glob import glob
from tqdm import tqdm
import librosa
import torch
from transformers import Wav2Vec2FeatureExtractor, Wav2Vec2Model

# Initialize paths and parameters
model_name = "facebook/wav2vec2-large-xlsr-53"
feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained(model_name)
model = Wav2Vec2Model.from_pretrained(model_name)

train_audio_path = r"F:\\Awais_data\\Datasets\\PartialSpoof\\train\\con_wav\\*.wav"
segment_label_path = r"F:\Awais_data\Datasets\PartialSpoof\database_segment_labels\database\segment_labels\train_seglab_0.16.npy"
utterance_label_path = r"F:\Awais_data\Datasets\PartialSpoof\protocols\PartialSpoof_LA_cm_protocols\PartialSpoof.LA.cm.train.trl.txt"
save_path = r"C:\\Notebooks\\rrl_source\\dataset_raw\\"

window_size = 0.16  # Segment length in seconds
hop_size = 0.16     # Frame shift in seconds
extract_utterance_level = False  # Set to True if you want utterance-level, False for segment-level

# Label loading functions
def load_segment_labels(label_file):
    return np.load(label_file, allow_pickle=True).item()

def load_utterance_labels(label_file):
    labels = {}
    with open(label_file, 'r') as file:
        for line in file:
            parts = line.strip().split(' ')
            if len(parts) >= 5:
                file_id = parts[1].strip()
                label = parts[-1].strip()
                labels[file_id] = label
    return labels

# Wav2Vec2 feature extraction function
def extract_features_wav2vec(segment, sr):
    inputs = feature_extractor(segment, sampling_rate=sr, return_tensors="pt", padding=True)
    with torch.no_grad():
        outputs = model(**inputs)
    segment_features = outputs.last_hidden_state.mean(dim=1).squeeze().numpy()
    return segment_features

# Process dataset with Wav2Vec2-XLSR features
def process_dataset(audio_files, labels, extract_utterance_level):
    all_features = []

    for audio_file in tqdm(audio_files, total=len(audio_files), desc="Processing files"):
        file_id = audio_file.split('\\')[-1].replace('.wav', '')
        
        if file_id not in labels:
            print(f"Labels not found for {file_id}, skipping...")
            continue
        
        y, sr = librosa.load(audio_file, sr=16000)
        
        if extract_utterance_level:
            # Extract features for the entire utterance
            features = extract_features_wav2vec(y, sr)
            label = 1 if labels[file_id] == 'spoof' else 0
            polarity = 'positive' if label == 1 else 'negative'
            features_with_label = np.append([file_id], features.tolist() + [label, polarity])
            all_features.append(features_with_label)
        else:
            # Segment-level feature extraction
            segment_labels = labels[file_id]
            segment_length = int(window_size * sr)
            hop_length = int(hop_size * sr)
            num_segments = len(segment_labels)
            
            for j in range(num_segments):
                start_sample = j * hop_length
                end_sample = start_sample + segment_length
                
                if end_sample > len(y):
                    break
                
                segment = y[start_sample:end_sample]
                segment_features = extract_features_wav2vec(segment, sr)
                label = int(segment_labels[j])
                polarity = 'positive' if label == 1 else 'negative'
                features_with_label = np.append([file_id], segment_features.tolist() + [label, polarity])
                all_features.append(features_with_label)

    feature_columns = ['Wav2Vec2_Feature_' + str(i+1) for i in range(len(segment_features))]
    columns = ['FileID'] + feature_columns + ['Label', 'Polarity']
    features_df = pd.DataFrame(all_features, columns=columns)
    return features_df

# Load labels based on extract_utterance_level flag
if extract_utterance_level:
    train_labels = load_utterance_labels(utterance_label_path)
else:
    train_labels = load_segment_labels(segment_label_path)

# Process audio files
train_audio_files = glob(train_audio_path)
train_audio_files = train_audio_files[5000:25380]
train_features_df = process_dataset(train_audio_files, train_labels, extract_utterance_level)

# Save the resulting features to CSV
if extract_utterance_level:
    train_features_df.to_csv(save_path + 'train_utterance_Wav2Vec2_1024.csv', index=False)
else:
    train_features_df.to_csv(save_path + 'train_segment_Wav2Vec2_1024.csv', index=False)

print("Feature extraction with Wav2Vec2-XLSR completed.")


Some weights of Wav2Vec2Model were not initialized from the model checkpoint at facebook/wav2vec2-large-xlsr-53 and are newly initialized: ['wav2vec2.encoder.pos_conv_embed.conv.parametrizations.weight.original0', 'wav2vec2.encoder.pos_conv_embed.conv.parametrizations.weight.original1']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Processing files: 100%|███████████████████████████████████████████████████████| 20380/20380 [10:27:04<00:00,  1.85s/it]


Feature extraction with Wav2Vec2-XLSR completed.


In [10]:
pwd

'C:\\Notebooks'

In [None]:
import os
import numpy as np
import pandas as pd
from glob import glob
from tqdm import tqdm
import librosa
import torch
from transformers import Wav2Vec2FeatureExtractor, Wav2Vec2Model

# Initialize paths and parameters
model_name = "facebook/wav2vec2-large-xlsr-53"
feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained(model_name)
model = Wav2Vec2Model.from_pretrained(model_name)

train_audio_path = r"F:\\Awais_data\\Datasets\\PartialSpoof\\train\\con_wav\\*.wav"
segment_label_path = r"F:\Awais_data\Datasets\PartialSpoof\database_segment_labels\database\segment_labels\train_seglab_0.16.npy"
utterance_label_path = r"F:\Awais_data\Datasets\PartialSpoof\protocols\PartialSpoof_LA_cm_protocols\PartialSpoof.LA.cm.train.trl.txt"
save_path = r"C:\\Notebooks\\rrl_source\\dataset_raw\\"

window_size = 0.16  # Segment length in seconds
hop_size = 0.16     # Frame shift in seconds
extract_utterance_level = False  # Set to True if you want utterance-level, False for segment-level

# Label loading functions
def load_segment_labels(label_file):
    return np.load(label_file, allow_pickle=True).item()

def load_utterance_labels(label_file):
    labels = {}
    with open(label_file, 'r') as file:
        for line in file:
            parts = line.strip().split(' ')
            if len(parts) >= 5:
                file_id = parts[1].strip()
                label = parts[-1].strip()
                labels[file_id] = label
    return labels

# Wav2Vec2 feature extraction function
def extract_features_wav2vec(segment, sr):
    inputs = feature_extractor(segment, sampling_rate=sr, return_tensors="pt", padding=True)
    with torch.no_grad():
        outputs = model(**inputs)
    segment_features = outputs.last_hidden_state.mean(dim=1).squeeze().numpy()
    return segment_features

# Process dataset with Wav2Vec2-XLSR features
def process_dataset(audio_files, labels, extract_utterance_level):
    all_features = []

    for audio_file in tqdm(audio_files, total=len(audio_files), desc="Processing files"):
        file_id = audio_file.split('\\')[-1].replace('.wav', '')
        
        if file_id not in labels:
            print(f"Labels not found for {file_id}, skipping...")
            continue
        
        y, sr = librosa.load(audio_file, sr=16000)
        
        if extract_utterance_level:
            # Extract features for the entire utterance
            features = extract_features_wav2vec(y, sr)
            label = 1 if labels[file_id] == 'spoof' else 0
            features_with_label = np.append(features.tolist(), label)
            all_features.append(features_with_label)
        else:
            # Segment-level feature extraction
            segment_labels = labels[file_id]
            segment_length = int(window_size * sr)
            hop_length = int(hop_size * sr)
            num_segments = len(segment_labels)
            
            for j in range(num_segments):
                start_sample = j * hop_length
                end_sample = start_sample + segment_length
                
                if end_sample > len(y):
                    break
                
                segment = y[start_sample:end_sample]
                segment_features = extract_features_wav2vec(segment, sr)
                label = int(segment_labels[j])
                features_with_label = np.append(segment_features.tolist(), label)
                all_features.append(features_with_label)

    return pd.DataFrame(all_features)

# Load labels based on extract_utterance_level flag
if extract_utterance_level:
    train_labels = load_utterance_labels(utterance_label_path)
else:
    train_labels = load_segment_labels(segment_label_path)

# Process audio files
train_audio_files = glob(train_audio_path)
train_audio_files = train_audio_files[5000:25380]
train_features_df = process_dataset(train_audio_files, train_labels, extract_utterance_level)

# Save the resulting features to CSV without header and without index
if extract_utterance_level:
    train_features_df.to_csv(save_path + 'train_utterance_Wav2Vec2_1024.csv', index=False, header=False)
else:
    train_features_df.to_csv(save_path + 'train_segment_Wav2Vec2_1024.csv', index=False, header=False)

print("Feature extraction with Wav2Vec2-XLSR completed.")


In [None]:
import os
import numpy as np
import pandas as pd
from glob import glob
from tqdm import tqdm
import librosa
import torch
from transformers import Wav2Vec2FeatureExtractor, Wav2Vec2Model
from sklearn.feature_selection import mutual_info_classif
from sklearn.preprocessing import StandardScaler

# Initialize paths and parameters
model_name = "facebook/wav2vec2-large-xlsr-53"
feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained(model_name)
model = Wav2Vec2Model.from_pretrained(model_name)

train_audio_path = r"F:\\Awais_data\\Datasets\\PartialSpoof\\train\\con_wav\\*.wav"
segment_label_path = r"F:\Awais_data\Datasets\PartialSpoof\database_segment_labels\database\segment_labels\train_seglab_0.16.npy"
utterance_label_path = r"F:\Awais_data\Datasets\PartialSpoof\protocols\PartialSpoof_LA_cm_protocols\PartialSpoof.LA.cm.train.trl.txt"
save_path = r"C:\\Notebooks\\rrl_source\\dataset_raw\\merge\\new\\"

window_size = 0.16  # Segment length in seconds
hop_size = 0.16     # Frame shift in seconds
extract_utterance_level = False  # Set to True for utterance-level, False for segment-level

# Hann window function
def hann_window(length):
    return 0.5 * (1 - np.cos(2 * np.pi * np.arange(length) / (length - 1)))

# Load segment-level or utterance-level labels
def load_labels(label_file, utterance_level):
    if utterance_level:
        labels = {}
        with open(label_file, 'r') as file:
            for line in file:
                parts = line.strip().split(' ')
                if len(parts) >= 5:
                    file_id = parts[1].strip()
                    label = parts[-1].strip()
                    labels[file_id] = 0 if label == 'spoof' else 1
    else:
        labels = np.load(label_file, allow_pickle=True).item()
    return labels

# Handcrafted feature extraction
def extract_handcrafted_features(segment, sr):
    windowed_segment = segment * hann_window(len(segment))
    mfcc = librosa.feature.mfcc(y=windowed_segment, sr=sr, n_mfcc=13).mean(axis=1)
    delta_mfcc = librosa.feature.delta(mfcc)
    tempo = librosa.beat.tempo(y=windowed_segment, sr=sr)[0]
    chroma = librosa.feature.chroma_stft(y=windowed_segment, sr=sr).mean(axis=1)
    zcr = librosa.feature.zero_crossing_rate(windowed_segment).mean()
    energy = librosa.feature.rms(y=windowed_segment).mean()
    pitches, _ = librosa.core.piptrack(y=windowed_segment, sr=sr)
    pitch = np.mean(pitches[pitches > 0]) if len(pitches[pitches > 0]) > 0 else 0
    tempogram = librosa.feature.tempogram(y=windowed_segment, sr=sr).mean(axis=1)
    downsampled_tempogram = tempogram[::int(np.ceil(len(tempogram) / 18))]
    features = np.concatenate((mfcc, delta_mfcc, [tempo], chroma, [zcr], [energy], [pitch], downsampled_tempogram))
    return features

def extract_wav2vec_features(segment, sr): 
    if len(segment.shape) > 1:   # Ensure the segment is a 1D array and not a 2D array
        segment = segment.flatten()  # Flatten it to 1D if needed
    
    inputs = feature_extractor(segment, sampling_rate=sr, return_tensors="pt", padding=False)
    
    with torch.no_grad():
        outputs = model(**inputs)  # Extract the last hidden state and return it
    return outputs.last_hidden_state.mean(dim=1).squeeze().numpy()



# Main processing function
def process_dataset(audio_files, labels, utterance_level):
    all_features = []
    for audio_file in tqdm(audio_files, desc="Processing files"):
        file_id = os.path.basename(audio_file).replace('.wav', '')
        if file_id not in labels:
            continue

        y, sr = librosa.load(audio_file, sr=16000)
        if utterance_level:
            label = labels[file_id]
            handcrafted_features = extract_handcrafted_features(y, sr)
            wav2vec_features = extract_wav2vec_features(y, sr)
            combined_features = np.concatenate(([file_id], handcrafted_features, wav2vec_features, [label]))
            all_features.append(combined_features)
        else:
            segment_labels = labels[file_id]
            segment_length = int(window_size * sr)
            hop_length = int(hop_size * sr)
            for i, seg_label in enumerate(segment_labels):
                start = i * hop_length
                end = start + segment_length
                if end > len(y):
                    break
                segment = y[start:end]
                handcrafted_features = extract_handcrafted_features(segment, sr)
                wav2vec_features = extract_wav2vec_features(segment, sr)
                combined_features = np.concatenate(([file_id], handcrafted_features, wav2vec_features, [seg_label]))
                all_features.append(combined_features)

    return np.array(all_features)

# Load labels and process dataset
labels = load_labels(utterance_label_path if extract_utterance_level else segment_label_path, extract_utterance_level)
audio_files = glob(train_audio_path)
features = process_dataset(audio_files, labels, extract_utterance_level)

# Save features in parts
part_size = 80000
for i in range(0, len(features), part_size):
    part_features = features[i:i + part_size]
    part_name = f"{save_path}train_{'utterance' if extract_utterance_level else 'segment'}_merged_part_{i // part_size + 1}.csv"
    pd.DataFrame(part_features).to_csv(part_name, index=False)

In [None]:
import pandas as pd
from sklearn.feature_selection import mutual_info_classif
from sklearn.preprocessing import StandardScaler

# Load the features file
features_file = f"{save_path}train_{'utterance' if extract_utterance_level else 'segment'}_merged_effective.csv"
data = pd.read_csv(features_file)

# Separate features and labels
features = data.iloc[:, :-1]  # All columns except the last one
labels = data.iloc[:, -1].astype(int)  # The last column is assumed to be the label

# Standardize features
scaler = StandardScaler()
features_scaled = scaler.fit_transform(features)

# Perform feature selection based on mutual information
mi_scores = mutual_info_classif(features_scaled, labels)
feature_importance = sorted(enumerate(mi_scores), key=lambda x: x[1], reverse=True)
selected_feature_indices = [idx for idx, score in feature_importance if score > 0.01]  # Adjust threshold as needed

# Extract effective features
effective_features = features.iloc[:, selected_feature_indices]

# Add the label column back to the effective features
effective_features['Label'] = labels.values

# Save the effective features to a new CSV file
effective_features_save_name = f"{save_path}effective_features.csv"
effective_features.to_csv(effective_features_save_name, index=False)

print("Effective features extraction and saving completed.")


In [None]:
# Correlation analysis and feature selection
def feature_selection(features, labels):
    scaler = StandardScaler()
    features_scaled = scaler.fit_transform(features)
    mi_scores = mutual_info_classif(features_scaled, labels)
    feature_importance = sorted(enumerate(mi_scores), key=lambda x: x[1], reverse=True)
    selected_features = [idx for idx, score in feature_importance if score > 0.01]  # Select features with significant MI
    return features[:, selected_features]

# Run feature selection on combined features
features_df = pd.DataFrame(features)
labels = features_df.iloc[:, -1].astype(int)
selected_features = feature_selection(features_df.iloc[:, 1:-1].values, labels)

# Create a DataFrame for selected features and add labels column
selected_features_df = pd.DataFrame(selected_features)
selected_features_df['Label'] = labels.values  # Add labels as the last column

# Save selected features with labels
effective_save_name = f"{save_path}train_{'utterance' if extract_utterance_level else 'segment'}_merged_effective.csv"
selected_features_df.to_csv(effective_save_name, index=False)

print("Feature extraction, saving, and selection completed.")


In [2]:
# Correlation analysis and feature selection
def feature_selection(features, labels):
    scaler = StandardScaler()
    features_scaled = scaler.fit_transform(features)
    mi_scores = mutual_info_classif(features_scaled, labels)
    feature_importance = sorted(enumerate(mi_scores), key=lambda x: x[1], reverse=True)
    selected_features = [idx for idx, score in feature_importance if score > 0.01]  # Select features with significant MI
    return features[:, selected_features]

# Run feature selection on combined features
features_df = pd.DataFrame(features)
labels = features_df.iloc[:, -1].astype(int)
selected_features = feature_selection(features_df.iloc[:, 1:-1].values, labels)

# Save selected features
effective_save_name = f"{save_path}train_{'utterance' if extract_utterance_level else 'segment'}_merged_effective.csv"
pd.DataFrame(selected_features).to_csv(effective_save_name, index=False)

print("Feature extraction, saving, and selection completed.")

Feature extraction, saving, and selection completed.


Frame level Classification

In [10]:
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.metrics import accuracy_score, roc_auc_score, confusion_matrix

# Define a simple frame-level classifier (e.g., a small MLP)
class FrameLevelClassifier(nn.Module):
    def __init__(self, input_dim):
        super(FrameLevelClassifier, self).__init__()
        self.fc1 = nn.Linear(input_dim, 128)
        self.relu = nn.ReLU()
        self.fc2 = nn.Linear(128, 64)
        self.relu2 = nn.ReLU()
        self.output_layer = nn.Linear(64, 1)  # Binary classification: output 1 score per frame

    def forward(self, x):
        x = self.fc1(x)
        x = self.relu(x)
        x = self.fc2(x)
        x = self.relu2(x)
        x = self.output_layer(x)
        return torch.sigmoid(x).squeeze()  # Sigmoid activation for binary classification

# Example function to classify and perform max pooling
def classify_and_evaluate(features, labels, utterance_level, model):
    model.eval()
    all_frame_scores = []
    all_final_labels = []
    all_frame_labels = []  # To store individual frame labels for each audio

    for feature_set, label_set in zip(features, labels):
        # Convert features to tensor
        inputs = torch.tensor(feature_set, dtype=torch.float32)
        with torch.no_grad():
            frame_scores = model(inputs).numpy()  # Frame-level scores

        all_frame_scores.append(frame_scores)

        # Apply max pooling logic for utterance-level classification
        if utterance_level:
            pooled_label = 1 if np.sum(frame_scores > 0.5) > 1 else 0
            all_final_labels.append(pooled_label)
        else:
            # Frame-level labels (using 0.5 as threshold for binary classification)
            frame_labels = (frame_scores > 0.5).astype(int)
            all_final_labels.extend(frame_labels)
            all_frame_labels.append(frame_labels)  # Save frame-level labels

    return all_frame_scores, all_final_labels, all_frame_labels

# Dummy Data Example
features = [np.random.rand(50, 1024) for _ in range(10)]  # 10 audio segments with 50 frames, 20 features each
labels = [np.random.randint(0, 2, size=50) for _ in range(10)]  # Random binary labels for frames

# Initialize and train a classifier (example training loop)
input_dim = 1024  # Example feature dimension
model = FrameLevelClassifier(input_dim)
criterion = nn.BCELoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

# Example training loop (adjust with real data)
for epoch in range(10):
    model.train()
    for feature_set, label_set in zip(features, labels):
        inputs = torch.tensor(feature_set, dtype=torch.float32)
        targets = torch.tensor(label_set, dtype=torch.float32)

        optimizer.zero_grad()
        outputs = model(inputs)
        loss = criterion(outputs, targets)
        loss.backward()
        optimizer.step()

# Classify and evaluate
utterance_level = False  # Toggle for utterance or segment-level evaluation
frame_scores, final_labels, frame_labels = classify_and_evaluate(features, labels, utterance_level, model)

# Define the path to save the results
results_path = './results.txt'

# Store results in a file
with open(results_path, 'w') as file:
    for i, (scores, final_label, frame_label) in enumerate(zip(frame_scores, final_labels, frame_labels)):
        file.write(f"Audio {i+1}:\n")
        file.write(f"Predicted Frame-Level Labels: {frame_label}\n")
        file.write(f"Aggregated Utterance-Level Label: {final_label}\n")
        file.write(f"Frame Scores: {scores}\n\n")

# Display results in the console
for i, (scores, final_label, frame_label) in enumerate(zip(frame_scores, final_labels, frame_labels)):
    print(f"Audio {i+1}:")
    print(f"Predicted Frame-Level Labels: {frame_label}")
    print(f"Aggregated Utterance-Level Label: {final_label}")
    print(f"Frame Scores: {scores}\n")

# Compute metrics (example for pooled results)
if utterance_level:
    y_true = [np.sum(l) > 0 for l in labels]  # Assuming ground truth for utterance-level
    # Check for class imbalance in y_true and final_labels before computing AUC
    if len(np.unique(y_true)) > 1 and len(np.unique(final_labels)) > 1:
        auc_score = roc_auc_score(y_true, final_labels)
        print(f"AUC Score (Utterance-Level): {auc_score}")
    else:
        print("AUC Score is not defined as only one class is present in y_true or final_labels.")
        auc_score = None  # Or handle as needed

    # Confusion Matrix Example
    cm = confusion_matrix(y_true, final_labels)
    print(f"Confusion Matrix:\n{cm}")
else:
    # If using frame-level evaluation, print accuracy and confusion matrix
    y_true_frame = [frame_label for label_set in labels for frame_label in label_set]
    frame_labels = [frame_label for frame_scores in final_labels for frame_label in (np.array(frame_scores) > 0.5).astype(int)]
    
    accuracy = accuracy_score(y_true_frame, frame_labels)
    print(f"Accuracy (Frame-Level): {accuracy}")
    
    cm = confusion_matrix(y_true_frame, frame_labels)
    print(f"Confusion Matrix (Frame-Level):\n{cm}")


Audio 1:
Predicted Frame-Level Labels: [1 0 1 0 1 0 0 0 0 0 1 1 0 1 1 0 0 0 1 1 0 1 1 1 0 1 1 0 0 1 0 1 0 1 0 0 1
 0 1 0 0 0 1 0 1 0 1 0 1 0]
Aggregated Utterance-Level Label: 1
Frame Scores: [0.51651025 0.19683348 0.5832396  0.36314127 0.792523   0.31978112
 0.32403266 0.19865608 0.47405243 0.28671148 0.57941854 0.5465663
 0.41384107 0.80777293 0.63284963 0.27037117 0.26781967 0.4482317
 0.5572884  0.67634547 0.24657296 0.6267955  0.7551319  0.61662495
 0.4356546  0.60165143 0.58016634 0.26444578 0.32016668 0.7135902
 0.388962   0.78363305 0.39304635 0.65364254 0.14468166 0.38235855
 0.5759792  0.17293951 0.6297013  0.35401466 0.2188062  0.37123403
 0.7639302  0.4092591  0.7939836  0.34155843 0.58516186 0.38466197
 0.65076846 0.34097603]

Audio 2:
Predicted Frame-Level Labels: [0 0 0 0 1 0 0 0 0 0 0 1 0 1 0 1 1 0 0 1 1 0 0 0 1 0 1 1 1 0 1 1 0 0 1 1 0
 1 0 0 1 0 0 1 0 0 1 0 1 0]
Aggregated Utterance-Level Label: 0
Frame Scores: [0.25908995 0.23540835 0.24152315 0.46822485 0.50066394 0.

TypeError: 'numpy.int32' object is not iterable

In [6]:
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
import pandas as pd
from sklearn.metrics import accuracy_score, roc_auc_score, confusion_matrix

# Read features and labels from CSV file
# def load_data_from_csv(file_path): # Load CSV file
#     df = pd.read_csv(file_path)# Extract features (excluding 'FileID', 'Polarity', and 'Label')
#     features = df.iloc[:, 1:-2].values  # Columns 2 to second-last (1024-dimensional features) # Extract labels (second-last column)
#     labels = df['Label'].values
#     return features, labels

def load_data_from_csv(file_path, samples_per_class=4000):# Load CSV file
    df = pd.read_csv(file_path) # Group by 'Label' and select only the first 'samples_per_class' rows from each group
    df_filtered = df.groupby('Label').apply(lambda x: x.head(samples_per_class)).reset_index(drop=True)# Extract features (excluding 'FileID', 'Polarity', and 'Label')
    features = df_filtered.iloc[:, 1:-2].values  # Columns 2 to second-last (1024-dimensional features)# Extract labels (second-last column)
    labels = df_filtered['Label'].values
    
    return features, labels

# Define a simple frame-level classifier (e.g., a small MLP)
class FrameLevelClassifier(nn.Module):
    def __init__(self, input_dim):
        super(FrameLevelClassifier, self).__init__()
        self.fc1 = nn.Linear(input_dim, 128)
        self.relu = nn.ReLU()
        self.fc2 = nn.Linear(128, 64)
        self.relu2 = nn.ReLU()
        self.output_layer = nn.Linear(64, 1)  # Binary classification: output 1 score per frame

    def forward(self, x):
        x = self.fc1(x)
        x = self.relu(x)
        x = self.fc2(x)
        x = self.relu2(x)
        x = self.output_layer(x)
        return torch.sigmoid(x)  # Ensure the output shape is (batch_size, 1)

# Function to classify and evaluate
def classify_and_evaluate(features, labels, utterance_level, model):
    model.eval()
    all_frame_scores = []
    all_final_labels = []

    for feature_set, label in zip(features, labels):
        # Convert features to tensor
        inputs = torch.tensor(feature_set, dtype=torch.float32).unsqueeze(0)  # Add batch dimension
        
        with torch.no_grad():
            frame_scores = model(inputs).numpy()  # Frame-level scores

        all_frame_scores.append(frame_scores)

        # Apply max pooling logic for utterance-level classification
        if utterance_level:
            pooled_label = 1 if np.sum(frame_scores > 0.5) > 1 else 0
            all_final_labels.append(pooled_label)
        else:
            frame_labels = (frame_scores > 0.5).astype(int)
            all_final_labels.append(frame_labels)

    return all_frame_scores, all_final_labels

# Load data from CSV
file_path = r'C:\Notebooks\rrl_source\dataset_raw\train_segment_Wav2Vec2_part1.csv'  # Update this path
features, labels = load_data_from_csv(file_path)

# Initialize and train a classifier
input_dim = 1024  # Feature dimension
model = FrameLevelClassifier(input_dim)
criterion = nn.BCELoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

# Convert labels to tensors
labels_tensor = torch.tensor(labels, dtype=torch.float32).unsqueeze(1)  # Add batch dimension

# Training loop
for epoch in range(10):
    model.train()
    for i in range(len(features)):
        feature_set = features[i].reshape(1, -1)  # Reshape for single input
        inputs = torch.tensor(feature_set, dtype=torch.float32)
        targets = labels_tensor[i].unsqueeze(0)  # Add batch dimension

        optimizer.zero_grad()
        outputs = model(inputs)
        loss = criterion(outputs, targets)  # Loss should work now since shapes match
        loss.backward()
        optimizer.step()

# Classify and evaluate
utterance_level = False  # Toggle for utterance or segment-level evaluation
frame_scores, final_labels = classify_and_evaluate(features, labels, utterance_level, model)

# Display results
for i, (scores, final_label) in enumerate(zip(frame_scores, final_labels)):
    print(f"Audio {i+1}:")
    print(f"Predicted Frame-Level Labels: {final_label}")
    print(f"Frame Scores: {scores}\n")

# Compute metrics
y_true = labels  # Assuming ground truth from the CSV
if utterance_level:
    y_pred = final_labels
    if len(np.unique(y_true)) > 1 and len(np.unique(y_pred)) > 1:
        auc_score = roc_auc_score(y_true, y_pred)
        print(f"AUC Score (Utterance-Level): {auc_score}")
    else:
        print("AUC Score is not defined due to lack of class variance.")
    cm = confusion_matrix(y_true, y_pred)
    print(f"Confusion Matrix:\n{cm}")
else:
    accuracy = accuracy_score(y_true, np.round(final_labels))
    print(f"Accuracy (Frame-Level): {accuracy}")
    cm = confusion_matrix(y_true, np.round(final_labels))
    print(f"Confusion Matrix (Frame-Level):\n{cm}")


# Load data from CSV
# file_path = r'C:\Notebooks\rrl_source\dataset_raw\train_segment_Wav2Vec2_part1.csv'  # Update this path



  df_filtered = df.groupby('Label').apply(lambda x: x.head(samples_per_class)).reset_index(drop=True)# Extract features (excluding 'FileID', 'Polarity', and 'Label')


Audio 1:
Predicted Frame-Level Labels: [[1]]
Frame Scores: [[0.9995407]]

Audio 2:
Predicted Frame-Level Labels: [[1]]
Frame Scores: [[0.9995407]]

Audio 3:
Predicted Frame-Level Labels: [[1]]
Frame Scores: [[0.9995407]]

Audio 4:
Predicted Frame-Level Labels: [[1]]
Frame Scores: [[0.9995407]]

Audio 5:
Predicted Frame-Level Labels: [[1]]
Frame Scores: [[0.9995407]]

Audio 6:
Predicted Frame-Level Labels: [[1]]
Frame Scores: [[0.9995407]]

Audio 7:
Predicted Frame-Level Labels: [[1]]
Frame Scores: [[0.9995407]]

Audio 8:
Predicted Frame-Level Labels: [[1]]
Frame Scores: [[0.9995407]]

Audio 9:
Predicted Frame-Level Labels: [[1]]
Frame Scores: [[0.9995407]]

Audio 10:
Predicted Frame-Level Labels: [[1]]
Frame Scores: [[0.9995407]]

Audio 11:
Predicted Frame-Level Labels: [[1]]
Frame Scores: [[0.9995407]]

Audio 12:
Predicted Frame-Level Labels: [[1]]
Frame Scores: [[0.9995407]]

Audio 13:
Predicted Frame-Level Labels: [[1]]
Frame Scores: [[0.9995407]]

Audio 14:
Predicted Frame-Level La

ValueError: Classification metrics can't handle a mix of binary and unknown targets

In [None]:
import os
import numpy as np
import pandas as pd
from glob import glob
from tqdm import tqdm
import librosa
import torch
from transformers import Wav2Vec2FeatureExtractor, Wav2Vec2Model
from sklearn.feature_selection import mutual_info_classif
from sklearn.preprocessing import StandardScaler

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"Using device: {device}")

# Initialize paths and parameters
model_name = "facebook/wav2vec2-large-xlsr-53"
feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained(model_name)
model = Wav2Vec2Model.from_pretrained(model_name).to(device)

train_audio_path = r"F:\\Awais_data\\Datasets\\PartialSpoof\\train\\con_wav\\*.wav"
segment_label_path = r"F:\Awais_data\Datasets\PartialSpoof\database_segment_labels\database\segment_labels\train_seglab_0.16.npy"
utterance_label_path = r"F:\Awais_data\Datasets\PartialSpoof\protocols\PartialSpoof_LA_cm_protocols\PartialSpoof.LA.cm.train.trl.txt"
save_path = r"C:\\Notebooks\\rrl_source\\dataset_raw\\merge\\new2\\"

window_size = 0.16  # Segment length in seconds
hop_size = 0.16     # Frame shift in seconds
extract_utterance_level = False  # Set to True for utterance-level, False for segment-level

# Hann window function
def hann_window(length):
    return 0.5 * (1 - np.cos(2 * np.pi * np.arange(length) / (length - 1)))

# Load segment-level or utterance-level labels
def load_labels(label_file, utterance_level):
    if utterance_level:
        labels = {}
        with open(label_file, 'r') as file:
            for line in file:
                parts = line.strip().split(' ')
                if len(parts) >= 5:
                    file_id = parts[1].strip()
                    label = parts[-1].strip()
                    labels[file_id] = 0 if label == 'spoof' else 1
    else:
        labels = np.load(label_file, allow_pickle=True).item()
    return labels

# Handcrafted feature extraction
def extract_handcrafted_features(segment, sr, target_size=60):
    windowed_segment = segment * hann_window(len(segment))
    mfcc = librosa.feature.mfcc(y=windowed_segment, sr=sr, n_mfcc=13).mean(axis=1)
    delta_mfcc = librosa.feature.delta(mfcc)
    tempo = librosa.beat.tempo(y=windowed_segment, sr=sr)[0]
    chroma = librosa.feature.chroma_stft(y=windowed_segment, sr=sr).mean(axis=1)
    zcr = librosa.feature.zero_crossing_rate(windowed_segment).mean()
    energy = librosa.feature.rms(y=windowed_segment).mean()
    pitches, _ = librosa.core.piptrack(y=windowed_segment, sr=sr)
    pitch = np.mean(pitches[pitches > 0]) if len(pitches[pitches > 0]) > 0 else 0
    tempogram = librosa.feature.tempogram(y=windowed_segment, sr=sr).mean(axis=1)
    downsampled_tempogram = tempogram[::int(np.ceil(len(tempogram) / 18))]
    features = np.concatenate((mfcc, delta_mfcc, [tempo], chroma, [zcr], [energy], [pitch], downsampled_tempogram))  # Pad or truncate features to the target size
    if len(features) < target_size:
        features = np.pad(features, (0, target_size - len(features)), mode='constant')
    elif len(features) > target_size:
        features = features[:target_size]
    return features

def extract_wav2vec_features(segment, sr, target_size=1024): 
    if len(segment.shape) > 1:   # Ensure the segment is a 1D array and not a 2D array
        segment = segment.flatten()  # Flatten it to 1D if needed  
    inputs = feature_extractor(segment, sampling_rate=sr, return_tensors="pt", padding=False).to(device)
    with torch.no_grad():
        outputs = model(**inputs)  # Extract the last hidden state and return it
    wav2vec_features =  outputs.last_hidden_state.mean(dim=1).cpu().squeeze().numpy()
    # Pad or truncate to fixed size (1024)
    if len(wav2vec_features) < target_size:
        wav2vec_features = np.pad(wav2vec_features, (0, target_size - len(wav2vec_features)), mode='constant')
    elif len(wav2vec_features) > target_size:
        wav2vec_features = wav2vec_features[:target_size]  
    return wav2vec_features


# Main processing function
def process_dataset(audio_files, labels, utterance_level):
    all_features = []
    balanced_audio_0 = []
    balanced_audio_1 = []
    
    for audio_file in tqdm(audio_files, desc="Processing files"):
        file_id = os.path.basename(audio_file).replace('.wav', '')
        if file_id not in labels:
            continue

        y, sr = librosa.load(audio_file, sr=16000)
        if utterance_level:
            label = labels[file_id]
            handcrafted_features = extract_handcrafted_features(y, sr)
            wav2vec_features = extract_wav2vec_features(y, sr)
            combined_features = np.concatenate(([file_id], handcrafted_features, wav2vec_features, [label]))
            all_features.append(combined_features)
        else:
            segment_labels = labels[file_id]
            # Check if all segment labels are 0 or all 1
            if np.all(segment_labels == 0):
                print(f"All labels for {file_id} are 0, treating as a single segment with label 0.")
                label = 0
                handcrafted_features = extract_handcrafted_features(y, sr)
                wav2vec_features = extract_wav2vec_features(y, sr)
                combined_features = np.concatenate(([file_id], handcrafted_features, wav2vec_features, [label]))
                balanced_audio_0.append(combined_features)
            elif np.all(segment_labels == 1):
                print(f"All labels for {file_id} are 1, treating as a single segment with label 1.")
                label = 1
                handcrafted_features = extract_handcrafted_features(y, sr)
                wav2vec_features = extract_wav2vec_features(y, sr)
                combined_features = np.concatenate(([file_id], handcrafted_features, wav2vec_features, [label]))
                balanced_audio_1.append(combined_features)
            else:
                # For mixed labels, process as usual
                segment_length = int(window_size * sr)
                hop_length = int(hop_size * sr)
                for i, seg_label in enumerate(segment_labels):
                    start = i * hop_length
                    end = start + segment_length
                    if end > len(y):
                        break
                    segment = y[start:end]
                    handcrafted_features = extract_handcrafted_features(segment, sr)
                    wav2vec_features = extract_wav2vec_features(segment, sr)
                    combined_features = np.concatenate(([file_id], handcrafted_features, wav2vec_features, [seg_label]))
                    all_features.append(combined_features)
    
    # Balancing the dataset
    num_0 = len(balanced_audio_0)
    num_1 = len(balanced_audio_1)
    min_len = min(num_0, num_1)
    
    balanced_audio_0 = balanced_audio_0[:min_len]
    balanced_audio_1 = balanced_audio_1[:min_len]
    
    all_features = balanced_audio_0 + balanced_audio_1 + all_features
    
    return np.array(all_features)

# Load labels and process dataset
labels = load_labels(utterance_label_path if extract_utterance_level else segment_label_path, extract_utterance_level)
audio_files = glob(train_audio_path)
features = process_dataset(audio_files, labels, extract_utterance_level)

# Save features in parts
part_size = 80000
for i in range(0, len(features), part_size):
    part_features = features[i:i + part_size]
    part_name = f"{save_path}train_{'utterance' if extract_utterance_level else 'segment'}_merged_part_{i // part_size + 1}.csv"
    pd.DataFrame(part_features).to_csv(part_name, index=False)


In [22]:
import os
import pandas as pd

save_path2=r'C:\\Notebooks\\rrl_source\\dataset_raw\\merge\\new\\Merged_feature_with_case1\\'
# Ensure save_path ends with a separator and the directory exists
if not save_path2.endswith(("/", "\\")):
    save_path += os.sep

if not os.path.exists(save_path2):
    os.makedirs(save_path2)

# Save features in chunks
part_size = 80000
for i in range(0, len(features), part_size):
    part_features = features[i:i + part_size]
    part_name = f"{save_path2}train_{'utterance' if extract_utterance_level else 'segment'}_merged_part_{i // part_size + 1}.csv"
    pd.DataFrame(part_features).to_csv(part_name, index=False)


In [None]:
features.shape